diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5669bd8981d88d1e36835ffcbd7526af8f983861 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95186df5b3792449390b3cdfc58ce0d74b26c09e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73ee8d0959379988180c886f09b6361efa56ff39 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..822ed4403b350676bbdd1f6e1e4762d26ee0d709 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f1d7348a488cadd338fbf1a9c328264532056cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a950ee23a87d928ca1f5baf038e85e47702961df Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0f1e50604422b0d9a81fe3e64a945f9142452ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6d310777410b081a89f9a115e2fd20e5d8d46cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..867792a1f1467557441b0fe26cd1f07458203f21 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b925283cb545e2d57135995b0de76cb05bfe7a2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f05480658a3fd5c31d09032fcbc3595efb88e08d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9730ccc4546c51521088b4eecb4bd540b37bd820 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d73577acb483a732916e7dc8c1a1e9bfeefaca6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cd2c0173d810137bf8fd93be662bd17e2b48a35 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06256a50e92559ecccec367e932236eebe7b7c45 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89707ba3e0f24e6acaa247dcf1951101ef8ce6dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a02bd58909c197344263f11e8bd459bee5ea8da3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ca02ac00841a2ae9c5f13b670384643c7074428 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04c67f829d305f6e67efe97c2fe4febe14f88741 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c98399c08a2fe92a230f85e8cdff536bb0422c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..f9fea70d25fe2974e19d35186eee5de60a008eb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py @@ -0,0 +1,304 @@ +import errno +import io +import os +import secrets +import shutil +from contextlib import suppress +from functools import cached_property, wraps +from urllib.parse import parse_qs + +from fsspec.spec import AbstractFileSystem +from fsspec.utils import ( + get_package_version_without_import, + infer_storage_options, + mirror_from, + tokenize, +) + + +def wrap_exceptions(func): + @wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except OSError as exception: + if not exception.args: + raise + + message, *args = exception.args + if isinstance(message, str) and "does not exist" in message: + raise FileNotFoundError(errno.ENOENT, message) from exception + else: + raise + + return wrapper + + +PYARROW_VERSION = None + + +class ArrowFSWrapper(AbstractFileSystem): + """FSSpec-compatible wrapper of pyarrow.fs.FileSystem. + + Parameters + ---------- + fs : pyarrow.fs.FileSystem + + """ + + root_marker = "/" + + def __init__(self, fs, **kwargs): + global PYARROW_VERSION + PYARROW_VERSION = get_package_version_without_import("pyarrow") + self.fs = fs + super().__init__(**kwargs) + + @property + def protocol(self): + return self.fs.type_name + + @cached_property + def fsid(self): + return "hdfs_" + tokenize(self.fs.host, self.fs.port) + + @classmethod + def _strip_protocol(cls, path): + ops = infer_storage_options(path) + path = ops["path"] + if path.startswith("//"): + # special case for "hdfs://path" (without the triple slash) + path = path[1:] + return path + + def ls(self, path, detail=False, **kwargs): + path = self._strip_protocol(path) + from pyarrow.fs import FileSelector + + entries = [ + self._make_entry(entry) + for entry in self.fs.get_file_info(FileSelector(path)) + ] + if detail: + return entries + else: + return [entry["name"] for entry in entries] + + def info(self, path, **kwargs): + path = self._strip_protocol(path) + [info] = self.fs.get_file_info([path]) + return self._make_entry(info) + + def exists(self, path): + path = self._strip_protocol(path) + try: + self.info(path) + except FileNotFoundError: + return False + else: + return True + + def _make_entry(self, info): + from pyarrow.fs import FileType + + if info.type is FileType.Directory: + kind = "directory" + elif info.type is FileType.File: + kind = "file" + elif info.type is FileType.NotFound: + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path) + else: + kind = "other" + + return { + "name": info.path, + "size": info.size, + "type": kind, + "mtime": info.mtime, + } + + @wrap_exceptions + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") + + with self._open(path1, "rb") as lstream: + tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}" + try: + with self.open(tmp_fname, "wb") as rstream: + shutil.copyfileobj(lstream, rstream) + self.fs.move(tmp_fname, path2) + except BaseException: # noqa + with suppress(FileNotFoundError): + self.fs.delete_file(tmp_fname) + raise + + @wrap_exceptions + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") + self.fs.move(path1, path2) + + @wrap_exceptions + def rm_file(self, path): + path = self._strip_protocol(path) + self.fs.delete_file(path) + + @wrap_exceptions + def rm(self, path, recursive=False, maxdepth=None): + path = self._strip_protocol(path).rstrip("/") + if self.isdir(path): + if recursive: + self.fs.delete_dir(path) + else: + raise ValueError("Can't delete directories without recursive=False") + else: + self.fs.delete_file(path) + + @wrap_exceptions + def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs): + if mode == "rb": + if seekable: + method = self.fs.open_input_file + else: + method = self.fs.open_input_stream + elif mode == "wb": + method = self.fs.open_output_stream + elif mode == "ab": + method = self.fs.open_append_stream + else: + raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}") + + _kwargs = {} + if mode != "rb" or not seekable: + if int(PYARROW_VERSION.split(".")[0]) >= 4: + # disable compression auto-detection + _kwargs["compression"] = None + stream = method(path, **_kwargs) + + return ArrowFile(self, stream, path, mode, block_size, **kwargs) + + @wrap_exceptions + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if create_parents: + self.makedirs(path, exist_ok=True) + else: + self.fs.create_dir(path, recursive=False) + + @wrap_exceptions + def makedirs(self, path, exist_ok=False): + path = self._strip_protocol(path) + self.fs.create_dir(path, recursive=True) + + @wrap_exceptions + def rmdir(self, path): + path = self._strip_protocol(path) + self.fs.delete_dir(path) + + @wrap_exceptions + def modified(self, path): + path = self._strip_protocol(path) + return self.fs.get_file_info(path).mtime + + def cat_file(self, path, start=None, end=None, **kwargs): + kwargs["seekable"] = start not in [None, 0] + return super().cat_file(path, start=None, end=None, **kwargs) + + def get_file(self, rpath, lpath, **kwargs): + kwargs["seekable"] = False + super().get_file(rpath, lpath, **kwargs) + + +@mirror_from( + "stream", + [ + "read", + "seek", + "tell", + "write", + "readable", + "writable", + "close", + "size", + "seekable", + ], +) +class ArrowFile(io.IOBase): + def __init__(self, fs, stream, path, mode, block_size=None, **kwargs): + self.path = path + self.mode = mode + + self.fs = fs + self.stream = stream + + self.blocksize = self.block_size = block_size + self.kwargs = kwargs + + def __enter__(self): + return self + + def __exit__(self, *args): + return self.close() + + +class HadoopFileSystem(ArrowFSWrapper): + """A wrapper on top of the pyarrow.fs.HadoopFileSystem + to connect it's interface with fsspec""" + + protocol = "hdfs" + + def __init__( + self, + host="default", + port=0, + user=None, + kerb_ticket=None, + replication=3, + extra_conf=None, + **kwargs, + ): + """ + + Parameters + ---------- + host: str + Hostname, IP or "default" to try to read from Hadoop config + port: int + Port to connect on, or default from Hadoop config if 0 + user: str or None + If given, connect as this username + kerb_ticket: str or None + If given, use this ticket for authentication + replication: int + set replication factor of file for write operations. default value is 3. + extra_conf: None or dict + Passed on to HadoopFileSystem + """ + from pyarrow.fs import HadoopFileSystem + + fs = HadoopFileSystem( + host=host, + port=port, + user=user, + kerb_ticket=kerb_ticket, + replication=replication, + extra_conf=extra_conf, + ) + super().__init__(fs=fs, **kwargs) + + @staticmethod + def _get_kwargs_from_urls(path): + ops = infer_storage_options(path) + out = {} + if ops.get("host", None): + out["host"] = ops["host"] + if ops.get("username", None): + out["user"] = ops["username"] + if ops.get("port", None): + out["port"] = ops["port"] + if ops.get("url_query", None): + queries = parse_qs(ops["url_query"]) + if queries.get("replication", None): + out["replication"] = int(queries["replication"][0]) + return out diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..6e7c7d88afdddf12f77b26bb635bd8bf1e2bd7f1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import abc +import hashlib + +from fsspec.implementations.local import make_path_posix + + +class AbstractCacheMapper(abc.ABC): + """Abstract super-class for mappers from remote URLs to local cached + basenames. + """ + + @abc.abstractmethod + def __call__(self, path: str) -> str: ... + + def __eq__(self, other: object) -> bool: + # Identity only depends on class. When derived classes have attributes + # they will need to be included. + return isinstance(other, type(self)) + + def __hash__(self) -> int: + # Identity only depends on class. When derived classes have attributes + # they will need to be included. + return hash(type(self)) + + +class BasenameCacheMapper(AbstractCacheMapper): + """Cache mapper that uses the basename of the remote URL and a fixed number + of directory levels above this. + + The default is zero directory levels, meaning different paths with the same + basename will have the same cached basename. + """ + + def __init__(self, directory_levels: int = 0): + if directory_levels < 0: + raise ValueError( + "BasenameCacheMapper requires zero or positive directory_levels" + ) + self.directory_levels = directory_levels + + # Separator for directories when encoded as strings. + self._separator = "_@_" + + def __call__(self, path: str) -> str: + path = make_path_posix(path) + prefix, *bits = path.rsplit("/", self.directory_levels + 1) + if bits: + return self._separator.join(bits) + else: + return prefix # No separator found, simple filename + + def __eq__(self, other: object) -> bool: + return super().__eq__(other) and self.directory_levels == other.directory_levels + + def __hash__(self) -> int: + return super().__hash__() ^ hash(self.directory_levels) + + +class HashCacheMapper(AbstractCacheMapper): + """Cache mapper that uses a hash of the remote URL.""" + + def __call__(self, path: str) -> str: + return hashlib.sha256(path.encode()).hexdigest() + + +def create_cache_mapper(same_names: bool) -> AbstractCacheMapper: + """Factory method to create cache mapper for backward compatibility with + ``CachingFileSystem`` constructor using ``same_names`` kwarg. + """ + if same_names: + return BasenameCacheMapper() + else: + return HashCacheMapper() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..bd9b5cdd99d7f4a0a989c0f7d0c70ddcf324816a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import os +import pickle +import time +from typing import TYPE_CHECKING + +from fsspec.utils import atomic_write + +try: + import ujson as json +except ImportError: + if not TYPE_CHECKING: + import json + +if TYPE_CHECKING: + from typing import Any, Dict, Iterator, Literal + + from typing_extensions import TypeAlias + + from .cached import CachingFileSystem + + Detail: TypeAlias = Dict[str, Any] + + +class CacheMetadata: + """Cache metadata. + + All reading and writing of cache metadata is performed by this class, + accessing the cached files and blocks is not. + + Metadata is stored in a single file per storage directory in JSON format. + For backward compatibility, also reads metadata stored in pickle format + which is converted to JSON when next saved. + """ + + def __init__(self, storage: list[str]): + """ + + Parameters + ---------- + storage: list[str] + Directories containing cached files, must be at least one. Metadata + is stored in the last of these directories by convention. + """ + if not storage: + raise ValueError("CacheMetadata expects at least one storage location") + + self._storage = storage + self.cached_files: list[Detail] = [{}] + + # Private attribute to force saving of metadata in pickle format rather than + # JSON for use in tests to confirm can read both pickle and JSON formats. + self._force_save_pickle = False + + def _load(self, fn: str) -> Detail: + """Low-level function to load metadata from specific file""" + try: + with open(fn, "r") as f: + loaded = json.load(f) + except ValueError: + with open(fn, "rb") as f: + loaded = pickle.load(f) + for c in loaded.values(): + if isinstance(c.get("blocks"), list): + c["blocks"] = set(c["blocks"]) + return loaded + + def _save(self, metadata_to_save: Detail, fn: str) -> None: + """Low-level function to save metadata to specific file""" + if self._force_save_pickle: + with atomic_write(fn) as f: + pickle.dump(metadata_to_save, f) + else: + with atomic_write(fn, mode="w") as f: + json.dump(metadata_to_save, f) + + def _scan_locations( + self, writable_only: bool = False + ) -> Iterator[tuple[str, str, bool]]: + """Yield locations (filenames) where metadata is stored, and whether + writable or not. + + Parameters + ---------- + writable: bool + Set to True to only yield writable locations. + + Returns + ------- + Yields (str, str, bool) + """ + n = len(self._storage) + for i, storage in enumerate(self._storage): + writable = i == n - 1 + if writable_only and not writable: + continue + yield os.path.join(storage, "cache"), storage, writable + + def check_file( + self, path: str, cfs: CachingFileSystem | None + ) -> Literal[False] | tuple[Detail, str]: + """If path is in cache return its details, otherwise return ``False``. + + If the optional CachingFileSystem is specified then it is used to + perform extra checks to reject possible matches, such as if they are + too old. + """ + for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files): + if path not in cache: + continue + detail = cache[path].copy() + + if cfs is not None: + if cfs.check_files and detail["uid"] != cfs.fs.ukey(path): + # Wrong file as determined by hash of file properties + continue + if cfs.expiry and time.time() - detail["time"] > cfs.expiry: + # Cached file has expired + continue + + fn = os.path.join(base, detail["fn"]) + if os.path.exists(fn): + return detail, fn + return False + + def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]: + """Remove expired metadata from the cache. + + Returns names of files corresponding to expired metadata and a boolean + flag indicating whether the writable cache is empty. Caller is + responsible for deleting the expired files. + """ + expired_files = [] + for path, detail in self.cached_files[-1].copy().items(): + if time.time() - detail["time"] > expiry_time: + fn = detail.get("fn", "") + if not fn: + raise RuntimeError( + f"Cache metadata does not contain 'fn' for {path}" + ) + fn = os.path.join(self._storage[-1], fn) + expired_files.append(fn) + self.cached_files[-1].pop(path) + + if self.cached_files[-1]: + cache_path = os.path.join(self._storage[-1], "cache") + self._save(self.cached_files[-1], cache_path) + + writable_cache_empty = not self.cached_files[-1] + return expired_files, writable_cache_empty + + def load(self) -> None: + """Load all metadata from disk and store in ``self.cached_files``""" + cached_files = [] + for fn, _, _ in self._scan_locations(): + if os.path.exists(fn): + # TODO: consolidate blocks here + cached_files.append(self._load(fn)) + else: + cached_files.append({}) + self.cached_files = cached_files or [{}] + + def on_close_cached_file(self, f: Any, path: str) -> None: + """Perform side-effect actions on closing a cached file. + + The actual closing of the file is the responsibility of the caller. + """ + # File must be writeble, so in self.cached_files[-1] + c = self.cached_files[-1][path] + if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size: + c["blocks"] = True + + def pop_file(self, path: str) -> str | None: + """Remove metadata of cached file. + + If path is in the cache, return the filename of the cached file, + otherwise return ``None``. Caller is responsible for deleting the + cached file. + """ + details = self.check_file(path, None) + if not details: + return None + _, fn = details + if fn.startswith(self._storage[-1]): + self.cached_files[-1].pop(path) + self.save() + else: + raise PermissionError( + "Can only delete cached file in last, writable cache location" + ) + return fn + + def save(self) -> None: + """Save metadata to disk""" + for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files): + if not writable: + continue + + if os.path.exists(fn): + cached_files = self._load(fn) + for k, c in cached_files.items(): + if k in cache: + if c["blocks"] is True or cache[k]["blocks"] is True: + c["blocks"] = True + else: + # self.cached_files[*][*]["blocks"] must continue to + # point to the same set object so that updates + # performed by MMapCache are propagated back to + # self.cached_files. + blocks = cache[k]["blocks"] + blocks.update(c["blocks"]) + c["blocks"] = blocks + c["time"] = max(c["time"], cache[k]["time"]) + c["uid"] = cache[k]["uid"] + + # Files can be added to cache after it was written once + for k, c in cache.items(): + if k not in cached_files: + cached_files[k] = c + else: + cached_files = cache + cache = {k: v.copy() for k, v in cached_files.items()} + for c in cache.values(): + if isinstance(c["blocks"], set): + c["blocks"] = list(c["blocks"]) + self._save(cache, fn) + self.cached_files[-1] = cached_files + + def update_file(self, path: str, detail: Detail) -> None: + """Update metadata for specific file in memory, do not save""" + self.cached_files[-1][path] = detail diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py new file mode 100644 index 0000000000000000000000000000000000000000..c2ee17214a23e7d74ecf935449cf8923ad903ce1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py @@ -0,0 +1,939 @@ +from __future__ import annotations + +import inspect +import logging +import os +import tempfile +import time +import weakref +from shutil import rmtree +from typing import TYPE_CHECKING, Any, Callable, ClassVar + +from fsspec import AbstractFileSystem, filesystem +from fsspec.callbacks import DEFAULT_CALLBACK +from fsspec.compression import compr +from fsspec.core import BaseCache, MMapCache +from fsspec.exceptions import BlocksizeMismatchError +from fsspec.implementations.cache_mapper import create_cache_mapper +from fsspec.implementations.cache_metadata import CacheMetadata +from fsspec.spec import AbstractBufferedFile +from fsspec.transaction import Transaction +from fsspec.utils import infer_compression + +if TYPE_CHECKING: + from fsspec.implementations.cache_mapper import AbstractCacheMapper + +logger = logging.getLogger("fsspec.cached") + + +class WriteCachedTransaction(Transaction): + def complete(self, commit=True): + rpaths = [f.path for f in self.files] + lpaths = [f.fn for f in self.files] + if commit: + self.fs.put(lpaths, rpaths) + self.files.clear() + self.fs._intrans = False + self.fs._transaction = None + self.fs = None # break cycle + + +class CachingFileSystem(AbstractFileSystem): + """Locally caching filesystem, layer over any other FS + + This class implements chunk-wise local storage of remote files, for quick + access after the initial download. The files are stored in a given + directory with hashes of URLs for the filenames. If no directory is given, + a temporary one is used, which should be cleaned up by the OS after the + process ends. The files themselves are sparse (as implemented in + :class:`~fsspec.caching.MMapCache`), so only the data which is accessed + takes up space. + + Restrictions: + + - the block-size must be the same for each access of a given file, unless + all blocks of the file have already been read + - caching can only be applied to file-systems which produce files + derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also + allowed, for testing + """ + + protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached") + + def __init__( + self, + target_protocol=None, + cache_storage="TMP", + cache_check=10, + check_files=False, + expiry_time=604800, + target_options=None, + fs=None, + same_names: bool | None = None, + compression=None, + cache_mapper: AbstractCacheMapper | None = None, + **kwargs, + ): + """ + + Parameters + ---------- + target_protocol: str (optional) + Target filesystem protocol. Provide either this or ``fs``. + cache_storage: str or list(str) + Location to store files. If "TMP", this is a temporary directory, + and will be cleaned up by the OS when this process ends (or later). + If a list, each location will be tried in the order given, but + only the last will be considered writable. + cache_check: int + Number of seconds between reload of cache metadata + check_files: bool + Whether to explicitly see if the UID of the remote file matches + the stored one before using. Warning: some file systems such as + HTTP cannot reliably give a unique hash of the contents of some + path, so be sure to set this option to False. + expiry_time: int + The time in seconds after which a local copy is considered useless. + Set to falsy to prevent expiry. The default is equivalent to one + week. + target_options: dict or None + Passed to the instantiation of the FS, if fs is None. + fs: filesystem instance + The target filesystem to run against. Provide this or ``protocol``. + same_names: bool (optional) + By default, target URLs are hashed using a ``HashCacheMapper`` so + that files from different backends with the same basename do not + conflict. If this argument is ``true``, a ``BasenameCacheMapper`` + is used instead. Other cache mapper options are available by using + the ``cache_mapper`` keyword argument. Only one of this and + ``cache_mapper`` should be specified. + compression: str (optional) + To decompress on download. Can be 'infer' (guess from the URL name), + one of the entries in ``fsspec.compression.compr``, or None for no + decompression. + cache_mapper: AbstractCacheMapper (optional) + The object use to map from original filenames to cached filenames. + Only one of this and ``same_names`` should be specified. + """ + super().__init__(**kwargs) + if fs is None and target_protocol is None: + raise ValueError( + "Please provide filesystem instance(fs) or target_protocol" + ) + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Both filesystems (fs) and target_protocol may not be both given." + ) + if cache_storage == "TMP": + tempdir = tempfile.mkdtemp() + storage = [tempdir] + weakref.finalize(self, self._remove_tempdir, tempdir) + else: + if isinstance(cache_storage, str): + storage = [cache_storage] + else: + storage = cache_storage + os.makedirs(storage[-1], exist_ok=True) + self.storage = storage + self.kwargs = target_options or {} + self.cache_check = cache_check + self.check_files = check_files + self.expiry = expiry_time + self.compression = compression + + # Size of cache in bytes. If None then the size is unknown and will be + # recalculated the next time cache_size() is called. On writes to the + # cache this is reset to None. + self._cache_size = None + + if same_names is not None and cache_mapper is not None: + raise ValueError( + "Cannot specify both same_names and cache_mapper in " + "CachingFileSystem.__init__" + ) + if cache_mapper is not None: + self._mapper = cache_mapper + else: + self._mapper = create_cache_mapper( + same_names if same_names is not None else False + ) + + self.target_protocol = ( + target_protocol + if isinstance(target_protocol, str) + else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]) + ) + self._metadata = CacheMetadata(self.storage) + self.load_cache() + self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs) + + def _strip_protocol(path): + # acts as a method, since each instance has a difference target + return self.fs._strip_protocol(type(self)._strip_protocol(path)) + + self._strip_protocol: Callable = _strip_protocol + + @staticmethod + def _remove_tempdir(tempdir): + try: + rmtree(tempdir) + except Exception: + pass + + def _mkcache(self): + os.makedirs(self.storage[-1], exist_ok=True) + + def cache_size(self): + """Return size of cache in bytes. + + If more than one cache directory is in use, only the size of the last + one (the writable cache directory) is returned. + """ + if self._cache_size is None: + cache_dir = self.storage[-1] + self._cache_size = filesystem("file").du(cache_dir, withdirs=True) + return self._cache_size + + def load_cache(self): + """Read set of stored blocks from file""" + self._metadata.load() + self._mkcache() + self.last_cache = time.time() + + def save_cache(self): + """Save set of stored blocks from file""" + self._mkcache() + self._metadata.save() + self.last_cache = time.time() + self._cache_size = None + + def _check_cache(self): + """Reload caches if time elapsed or any disappeared""" + self._mkcache() + if not self.cache_check: + # explicitly told not to bother checking + return + timecond = time.time() - self.last_cache > self.cache_check + existcond = all(os.path.exists(storage) for storage in self.storage) + if timecond or not existcond: + self.load_cache() + + def _check_file(self, path): + """Is path in cache and still valid""" + path = self._strip_protocol(path) + self._check_cache() + return self._metadata.check_file(path, self) + + def clear_cache(self): + """Remove all files and metadata from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be the read/write one. + """ + rmtree(self.storage[-1]) + self.load_cache() + self._cache_size = None + + def clear_expired_cache(self, expiry_time=None): + """Remove all expired files and metadata from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be the read/write one. + + Parameters + ---------- + expiry_time: int + The time in seconds after which a local copy is considered useless. + If not defined the default is equivalent to the attribute from the + file caching instantiation. + """ + + if not expiry_time: + expiry_time = self.expiry + + self._check_cache() + + expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time) + for fn in expired_files: + if os.path.exists(fn): + os.remove(fn) + + if writable_cache_empty: + rmtree(self.storage[-1]) + self.load_cache() + + self._cache_size = None + + def pop_from_cache(self, path): + """Remove cached version of given file + + Deletes local copy of the given (remote) path. If it is found in a cache + location which is not the last, it is assumed to be read-only, and + raises PermissionError + """ + path = self._strip_protocol(path) + fn = self._metadata.pop_file(path) + if fn is not None: + os.remove(fn) + self._cache_size = None + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + """Wrap the target _open + + If the whole file exists in the cache, just open it locally and + return that. + + Otherwise, open the file on the target FS, and make it have a mmap + cache pointing to the location which we determine, in our cache. + The ``blocks`` instance is shared, so as the mmap cache instance + updates, so does the entry in our ``cached_files`` attribute. + We monkey-patch this file, so that when it closes, we call + ``close_and_update`` to save the state of the blocks. + """ + path = self._strip_protocol(path) + + path = self.fs._strip_protocol(path) + if "r" not in mode: + return self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + detail = self._check_file(path) + if detail: + # file is in cache + detail, fn = detail + hash, blocks = detail["fn"], detail["blocks"] + if blocks is True: + # stored file is complete + logger.debug("Opening local copy of %s", path) + return open(fn, mode) + # TODO: action where partial file exists in read-only cache + logger.debug("Opening partially cached copy of %s", path) + else: + hash = self._mapper(path) + fn = os.path.join(self.storage[-1], hash) + blocks = set() + detail = { + "original": path, + "fn": hash, + "blocks": blocks, + "time": time.time(), + "uid": self.fs.ukey(path), + } + self._metadata.update_file(path, detail) + logger.debug("Creating local sparse file for %s", path) + + # call target filesystems open + self._mkcache() + f = self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + cache_type="none", + **kwargs, + ) + if self.compression: + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + if "blocksize" in detail: + if detail["blocksize"] != f.blocksize: + raise BlocksizeMismatchError( + f"Cached file must be reopened with same block" + f" size as original (old: {detail['blocksize']}," + f" new {f.blocksize})" + ) + else: + detail["blocksize"] = f.blocksize + f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) + close = f.close + f.close = lambda: self.close_and_update(f, close) + self.save_cache() + return f + + def _parent(self, path): + return self.fs._parent(path) + + def hash_name(self, path: str, *args: Any) -> str: + # Kept for backward compatibility with downstream libraries. + # Ignores extra arguments, previously same_name boolean. + return self._mapper(path) + + def close_and_update(self, f, close): + """Called when a file is closing, so store the set of blocks""" + if f.closed: + return + path = self._strip_protocol(f.path) + self._metadata.on_close_cached_file(f, path) + try: + logger.debug("going to save") + self.save_cache() + logger.debug("saved") + except OSError: + logger.debug("Cache saving failed while closing file") + except NameError: + logger.debug("Cache save failed due to interpreter shutdown") + close() + f.closed = True + + def ls(self, path, detail=True): + return self.fs.ls(path, detail) + + def __getattribute__(self, item): + if item in { + "load_cache", + "_open", + "save_cache", + "close_and_update", + "__init__", + "__getattribute__", + "__reduce__", + "_make_local_details", + "open", + "cat", + "cat_file", + "cat_ranges", + "get", + "read_block", + "tail", + "head", + "info", + "ls", + "exists", + "isfile", + "isdir", + "_check_file", + "_check_cache", + "_mkcache", + "clear_cache", + "clear_expired_cache", + "pop_from_cache", + "_mkcache", + "local_file", + "_paths_from_path", + "get_mapper", + "open_many", + "commit_many", + "hash_name", + "__hash__", + "__eq__", + "to_json", + "cache_size", + "pipe_file", + "pipe", + "isdir", + "isfile", + "exists", + "start_transaction", + "end_transaction", + }: + # all the methods defined in this class. Note `open` here, since + # it calls `_open`, but is actually in superclass + return lambda *args, **kw: getattr(type(self), item).__get__(self)( + *args, **kw + ) + if item in ["__reduce_ex__"]: + raise AttributeError + if item in ["transaction"]: + # property + return type(self).transaction.__get__(self) + if item in ["_cache", "transaction_type"]: + # class attributes + return getattr(type(self), item) + if item == "__class__": + return type(self) + d = object.__getattribute__(self, "__dict__") + fs = d.get("fs", None) # fs is not immediately defined + if item in d: + return d[item] + elif fs is not None: + if item in fs.__dict__: + # attribute of instance + return fs.__dict__[item] + # attributed belonging to the target filesystem + cls = type(fs) + m = getattr(cls, item) + if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and ( + not hasattr(m, "__self__") or m.__self__ is None + ): + # instance method + return m.__get__(fs, cls) + return m # class method or attribute + else: + # attributes of the superclass, while target is being set up + return super().__getattribute__(item) + + def __eq__(self, other): + """Test for equality.""" + if self is other: + return True + if not isinstance(other, type(self)): + return False + return ( + self.storage == other.storage + and self.kwargs == other.kwargs + and self.cache_check == other.cache_check + and self.check_files == other.check_files + and self.expiry == other.expiry + and self.compression == other.compression + and self._mapper == other._mapper + and self.target_protocol == other.target_protocol + ) + + def __hash__(self): + """Calculate hash.""" + return ( + hash(tuple(self.storage)) + ^ hash(str(self.kwargs)) + ^ hash(self.cache_check) + ^ hash(self.check_files) + ^ hash(self.expiry) + ^ hash(self.compression) + ^ hash(self._mapper) + ^ hash(self.target_protocol) + ) + + def to_json(self): + """Calculate JSON representation. + + Not implemented yet for CachingFileSystem. + """ + raise NotImplementedError( + "CachingFileSystem JSON representation not implemented" + ) + + +class WholeFileCacheFileSystem(CachingFileSystem): + """Caches whole remote files on first access + + This class is intended as a layer over any other file system, and + will make a local copy of each file accessed, so that all subsequent + reads are local. This is similar to ``CachingFileSystem``, but without + the block-wise functionality and so can work even when sparse files + are not allowed. See its docstring for definition of the init + arguments. + + The class still needs access to the remote store for listing files, + and may refresh cached files. + """ + + protocol = "filecache" + local_file = True + + def open_many(self, open_files, **kwargs): + paths = [of.path for of in open_files] + if "r" in open_files.mode: + self._mkcache() + else: + return [ + LocalTempFile( + self.fs, + path, + mode=open_files.mode, + fn=os.path.join(self.storage[-1], self._mapper(path)), + **kwargs, + ) + for path in paths + ] + + if self.compression: + raise NotImplementedError + details = [self._check_file(sp) for sp in paths] + downpath = [p for p, d in zip(paths, details) if not d] + downfn0 = [ + os.path.join(self.storage[-1], self._mapper(p)) + for p, d in zip(paths, details) + ] # keep these path names for opening later + downfn = [fn for fn, d in zip(downfn0, details) if not d] + if downpath: + # skip if all files are already cached and up to date + self.fs.get(downpath, downfn) + + # update metadata - only happens when downloads are successful + newdetail = [ + { + "original": path, + "fn": self._mapper(path), + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + for path in downpath + ] + for path, detail in zip(downpath, newdetail): + self._metadata.update_file(path, detail) + self.save_cache() + + def firstpart(fn): + # helper to adapt both whole-file and simple-cache + return fn[1] if isinstance(fn, tuple) else fn + + return [ + open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode) + for fn0, fn1 in zip(details, downfn0) + ] + + def commit_many(self, open_files): + self.fs.put([f.fn for f in open_files], [f.path for f in open_files]) + [f.close() for f in open_files] + for f in open_files: + # in case autocommit is off, and so close did not already delete + try: + os.remove(f.name) + except FileNotFoundError: + pass + self._cache_size = None + + def _make_local_details(self, path): + hash = self._mapper(path) + fn = os.path.join(self.storage[-1], hash) + detail = { + "original": path, + "fn": hash, + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + self._metadata.update_file(path, detail) + logger.debug("Copying %s to local cache", path) + return fn + + def cat( + self, + path, + recursive=False, + on_error="raise", + callback=DEFAULT_CALLBACK, + **kwargs, + ): + paths = self.expand_path( + path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None) + ) + getpaths = [] + storepaths = [] + fns = [] + out = {} + for p in paths.copy(): + try: + detail = self._check_file(p) + if not detail: + fn = self._make_local_details(p) + getpaths.append(p) + storepaths.append(fn) + else: + detail, fn = detail if isinstance(detail, tuple) else (None, detail) + fns.append(fn) + except Exception as e: + if on_error == "raise": + raise + if on_error == "return": + out[p] = e + paths.remove(p) + + if getpaths: + self.fs.get(getpaths, storepaths) + self.save_cache() + + callback.set_size(len(paths)) + for p, fn in zip(paths, fns): + with open(fn, "rb") as f: + out[p] = f.read() + callback.relative_update(1) + if isinstance(path, str) and len(paths) == 1 and recursive is False: + out = out[paths[0]] + return out + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + if "r" not in mode: + fn = self._make_local_details(path) + user_specified_kwargs = { + k: v + for k, v in kwargs.items() + # those kwargs were added by open(), we don't want them + if k not in ["autocommit", "block_size", "cache_options"] + } + return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs) + detail = self._check_file(path) + if detail: + detail, fn = detail + _, blocks = detail["fn"], detail["blocks"] + if blocks is True: + logger.debug("Opening local copy of %s", path) + + # In order to support downstream filesystems to be able to + # infer the compression from the original filename, like + # the `TarFileSystem`, let's extend the `io.BufferedReader` + # fileobject protocol by adding a dedicated attribute + # `original`. + f = open(fn, mode) + f.original = detail.get("original") + return f + else: + raise ValueError( + f"Attempt to open partially cached file {path}" + f" as a wholly cached file" + ) + else: + fn = self._make_local_details(path) + kwargs["mode"] = mode + + # call target filesystems open + self._mkcache() + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + data = True + while data: + block = getattr(f, "blocksize", 5 * 2**20) + data = f.read(block) + f2.write(data) + else: + self.fs.get_file(path, fn) + self.save_cache() + return self._open(path, mode) + + +class SimpleCacheFileSystem(WholeFileCacheFileSystem): + """Caches whole remote files on first access + + This class is intended as a layer over any other file system, and + will make a local copy of each file accessed, so that all subsequent + reads are local. This implementation only copies whole files, and + does not keep any metadata about the download time or file details. + It is therefore safer to use in multi-threaded/concurrent situations. + + This is the only of the caching filesystems that supports write: you will + be given a real local open file, and upon close and commit, it will be + uploaded to the target filesystem; the writability or the target URL is + not checked until that time. + + """ + + protocol = "simplecache" + local_file = True + transaction_type = WriteCachedTransaction + + def __init__(self, **kwargs): + kw = kwargs.copy() + for key in ["cache_check", "expiry_time", "check_files"]: + kw[key] = False + super().__init__(**kw) + for storage in self.storage: + if not os.path.exists(storage): + os.makedirs(storage, exist_ok=True) + + def _check_file(self, path): + self._check_cache() + sha = self._mapper(path) + for storage in self.storage: + fn = os.path.join(storage, sha) + if os.path.exists(fn): + return fn + + def save_cache(self): + pass + + def load_cache(self): + pass + + def pipe_file(self, path, value=None, **kwargs): + if self._intrans: + with self.open(path, "wb") as f: + f.write(value) + else: + super().pipe_file(path, value) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + details = [] + try: + details = self.fs.ls( + path, detail=True, **kwargs + ).copy() # don't edit original! + except FileNotFoundError as e: + ex = e + else: + ex = None + if self._intrans: + path1 = path.rstrip("/") + "/" + for f in self.transaction.files: + if f.path == path: + details.append( + {"name": path, "size": f.size or f.tell(), "type": "file"} + ) + elif f.path.startswith(path1): + if f.path.count("/") == path1.count("/"): + details.append( + {"name": f.path, "size": f.size or f.tell(), "type": "file"} + ) + else: + dname = "/".join(f.path.split("/")[: path1.count("/") + 1]) + details.append({"name": dname, "size": 0, "type": "directory"}) + if ex is not None and not details: + raise ex + if detail: + return details + return sorted(_["name"] for _ in details) + + def info(self, path, **kwargs): + path = self._strip_protocol(path) + if self._intrans: + f = [_ for _ in self.transaction.files if _.path == path] + if f: + return {"name": path, "size": f[0].size or f[0].tell(), "type": "file"} + f = any(_.path.startswith(path + "/") for _ in self.transaction.files) + if f: + return {"name": path, "size": 0, "type": "directory"} + return self.fs.info(path, **kwargs) + + def pipe(self, path, value=None, **kwargs): + if isinstance(path, str): + self.pipe_file(self._strip_protocol(path), value, **kwargs) + elif isinstance(path, dict): + for k, v in path.items(): + self.pipe_file(self._strip_protocol(k), v, **kwargs) + else: + raise ValueError("path must be str or dict") + + def cat_ranges( + self, paths, starts, ends, max_gap=None, on_error="return", **kwargs + ): + lpaths = [self._check_file(p) for p in paths] + rpaths = [p for l, p in zip(lpaths, paths) if l is False] + lpaths = [l for l, p in zip(lpaths, paths) if l is False] + self.fs.get(rpaths, lpaths) + return super().cat_ranges( + paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs + ) + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + sha = self._mapper(path) + + if "r" not in mode: + fn = os.path.join(self.storage[-1], sha) + user_specified_kwargs = { + k: v + for k, v in kwargs.items() + if k not in ["autocommit", "block_size", "cache_options"] + } # those were added by open() + return LocalTempFile( + self, + path, + mode=mode, + autocommit=not self._intrans, + fn=fn, + **user_specified_kwargs, + ) + fn = self._check_file(path) + if fn: + return open(fn, mode) + + fn = os.path.join(self.storage[-1], sha) + logger.debug("Copying %s to local cache", path) + kwargs["mode"] = mode + + self._mkcache() + self._cache_size = None + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + data = True + while data: + block = getattr(f, "blocksize", 5 * 2**20) + data = f.read(block) + f2.write(data) + else: + self.fs.get_file(path, fn) + return self._open(path, mode) + + +class LocalTempFile: + """A temporary local file, which will be uploaded on commit""" + + def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs): + self.fn = fn + self.fh = open(fn, mode) + self.mode = mode + if seek: + self.fh.seek(seek) + self.path = path + self.size = None + self.fs = fs + self.closed = False + self.autocommit = autocommit + self.kwargs = kwargs + + def __reduce__(self): + # always open in r+b to allow continuing writing at a location + return ( + LocalTempFile, + (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()), + ) + + def __enter__(self): + return self.fh + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + self.size = self.fh.tell() + if self.closed: + return + self.fh.close() + self.closed = True + if self.autocommit: + self.commit() + + def discard(self): + self.fh.close() + os.remove(self.fn) + + def commit(self): + self.fs.put(self.fn, self.path, **self.kwargs) + # we do not delete local copy - it's still in the cache + + @property + def name(self): + return self.fn + + def __repr__(self) -> str: + return f"LocalTempFile: {self.path}" + + def __getattr__(self, item): + return getattr(self.fh, item) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py new file mode 100644 index 0000000000000000000000000000000000000000..3e1276463db6866665e6a0fe114efc247971b57e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py @@ -0,0 +1,152 @@ +import dask +from distributed.client import Client, _get_global_client +from distributed.worker import Worker + +from fsspec import filesystem +from fsspec.spec import AbstractBufferedFile, AbstractFileSystem +from fsspec.utils import infer_storage_options + + +def _get_client(client): + if client is None: + return _get_global_client() + elif isinstance(client, Client): + return client + else: + # e.g., connection string + return Client(client) + + +def _in_worker(): + return bool(Worker._instances) + + +class DaskWorkerFileSystem(AbstractFileSystem): + """View files accessible to a worker as any other remote file-system + + When instances are run on the worker, uses the real filesystem. When + run on the client, they call the worker to provide information or data. + + **Warning** this implementation is experimental, and read-only for now. + """ + + def __init__( + self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs + ): + super().__init__(**kwargs) + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Please provide one of filesystem instance (fs) or" + " target_protocol, not both" + ) + self.target_protocol = target_protocol + self.target_options = target_options + self.worker = None + self.client = client + self.fs = fs + self._determine_worker() + + @staticmethod + def _get_kwargs_from_urls(path): + so = infer_storage_options(path) + if "host" in so and "port" in so: + return {"client": f"{so['host']}:{so['port']}"} + else: + return {} + + def _determine_worker(self): + if _in_worker(): + self.worker = True + if self.fs is None: + self.fs = filesystem( + self.target_protocol, **(self.target_options or {}) + ) + else: + self.worker = False + self.client = _get_client(self.client) + self.rfs = dask.delayed(self) + + def mkdir(self, *args, **kwargs): + if self.worker: + self.fs.mkdir(*args, **kwargs) + else: + self.rfs.mkdir(*args, **kwargs).compute() + + def rm(self, *args, **kwargs): + if self.worker: + self.fs.rm(*args, **kwargs) + else: + self.rfs.rm(*args, **kwargs).compute() + + def copy(self, *args, **kwargs): + if self.worker: + self.fs.copy(*args, **kwargs) + else: + self.rfs.copy(*args, **kwargs).compute() + + def mv(self, *args, **kwargs): + if self.worker: + self.fs.mv(*args, **kwargs) + else: + self.rfs.mv(*args, **kwargs).compute() + + def ls(self, *args, **kwargs): + if self.worker: + return self.fs.ls(*args, **kwargs) + else: + return self.rfs.ls(*args, **kwargs).compute() + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + if self.worker: + return self.fs._open( + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + else: + return DaskFile( + fs=self, + path=path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + **kwargs, + ) + + def fetch_range(self, path, mode, start, end): + if self.worker: + with self._open(path, mode) as f: + f.seek(start) + return f.read(end - start) + else: + return self.rfs.fetch_range(path, mode, start, end).compute() + + +class DaskFile(AbstractBufferedFile): + def __init__(self, mode="rb", **kwargs): + if mode != "rb": + raise ValueError('Remote dask files can only be opened in "rb" mode') + super().__init__(**kwargs) + + def _upload_chunk(self, final=False): + pass + + def _initiate_upload(self): + """Create remote file/upload""" + pass + + def _fetch_range(self, start, end): + """Get the specified set of bytes from remote""" + return self.fs.fetch_range(self.path, self.mode, start, end) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py new file mode 100644 index 0000000000000000000000000000000000000000..519032305bed633f2ba8a6148076433caf81710b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py @@ -0,0 +1,58 @@ +import base64 +import io +from typing import Optional +from urllib.parse import unquote + +from fsspec import AbstractFileSystem + + +class DataFileSystem(AbstractFileSystem): + """A handy decoder for data-URLs + + Example + ------- + >>> with fsspec.open("data:,Hello%2C%20World%21") as f: + ... print(f.read()) + b"Hello, World!" + + See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs + """ + + protocol = "data" + + def __init__(self, **kwargs): + """No parameters for this filesystem""" + super().__init__(**kwargs) + + def cat_file(self, path, start=None, end=None, **kwargs): + pref, data = path.split(",", 1) + if pref.endswith("base64"): + return base64.b64decode(data)[start:end] + return unquote(data).encode()[start:end] + + def info(self, path, **kwargs): + pref, name = path.split(",", 1) + data = self.cat_file(path) + mime = pref.split(":", 1)[1].split(";", 1)[0] + return {"name": name, "size": len(data), "type": "file", "mimetype": mime} + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + if "r" not in mode: + raise ValueError("Read only filesystem") + return io.BytesIO(self.cat_file(path)) + + @staticmethod + def encode(data: bytes, mime: Optional[str] = None): + """Format the given data into data-URL syntax + + This version always base64 encodes, even when the data is ascii/url-safe. + """ + return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py new file mode 100644 index 0000000000000000000000000000000000000000..ce9f9eadb798577970ee95530743b4521813ca7c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py @@ -0,0 +1,467 @@ +import base64 +import urllib + +import requests +import requests.exceptions +from requests.adapters import HTTPAdapter, Retry + +from fsspec import AbstractFileSystem +from fsspec.spec import AbstractBufferedFile + + +class DatabricksException(Exception): + """ + Helper class for exceptions raised in this module. + """ + + def __init__(self, error_code, message): + """Create a new DatabricksException""" + super().__init__(message) + + self.error_code = error_code + self.message = message + + +class DatabricksFileSystem(AbstractFileSystem): + """ + Get access to the Databricks filesystem implementation over HTTP. + Can be used inside and outside of a databricks cluster. + """ + + def __init__(self, instance, token, **kwargs): + """ + Create a new DatabricksFileSystem. + + Parameters + ---------- + instance: str + The instance URL of the databricks cluster. + For example for an Azure databricks cluster, this + has the form adb-..azuredatabricks.net. + token: str + Your personal token. Find out more + here: https://docs.databricks.com/dev-tools/api/latest/authentication.html + """ + self.instance = instance + self.token = token + self.session = requests.Session() + self.retries = Retry( + total=10, + backoff_factor=0.05, + status_forcelist=[408, 429, 500, 502, 503, 504], + ) + + self.session.mount("https://", HTTPAdapter(max_retries=self.retries)) + self.session.headers.update({"Authorization": f"Bearer {self.token}"}) + + super().__init__(**kwargs) + + def ls(self, path, detail=True, **kwargs): + """ + List the contents of the given path. + + Parameters + ---------- + path: str + Absolute path + detail: bool + Return not only the list of filenames, + but also additional information on file sizes + and types. + """ + out = self._ls_from_cache(path) + if not out: + try: + r = self._send_to_api( + method="get", endpoint="list", json={"path": path} + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) + + raise e + files = r["files"] + out = [ + { + "name": o["path"], + "type": "directory" if o["is_dir"] else "file", + "size": o["file_size"], + } + for o in files + ] + self.dircache[path] = out + + if detail: + return out + return [o["name"] for o in out] + + def makedirs(self, path, exist_ok=True): + """ + Create a given absolute path and all of its parents. + + Parameters + ---------- + path: str + Absolute path to create + exist_ok: bool + If false, checks if the folder + exists before creating it (and raises an + Exception if this is the case) + """ + if not exist_ok: + try: + # If the following succeeds, the path is already present + self._send_to_api( + method="get", endpoint="get-status", json={"path": path} + ) + raise FileExistsError(f"Path {path} already exists") + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + pass + + try: + self._send_to_api(method="post", endpoint="mkdirs", json={"path": path}) + except DatabricksException as e: + if e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) + + raise e + self.invalidate_cache(self._parent(path)) + + def mkdir(self, path, create_parents=True, **kwargs): + """ + Create a given absolute path and all of its parents. + + Parameters + ---------- + path: str + Absolute path to create + create_parents: bool + Whether to create all parents or not. + "False" is not implemented so far. + """ + if not create_parents: + raise NotImplementedError + + self.mkdirs(path, **kwargs) + + def rm(self, path, recursive=False, **kwargs): + """ + Remove the file or folder at the given absolute path. + + Parameters + ---------- + path: str + Absolute path what to remove + recursive: bool + Recursively delete all files in a folder. + """ + try: + self._send_to_api( + method="post", + endpoint="delete", + json={"path": path, "recursive": recursive}, + ) + except DatabricksException as e: + # This is not really an exception, it just means + # not everything was deleted so far + if e.error_code == "PARTIAL_DELETE": + self.rm(path=path, recursive=recursive) + elif e.error_code == "IO_ERROR": + # Using the same exception as the os module would use here + raise OSError(e.message) + + raise e + self.invalidate_cache(self._parent(path)) + + def mv( + self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs + ): + """ + Move a source to a destination path. + + A note from the original [databricks API manual] + (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move). + + When moving a large number of files the API call will time out after + approximately 60s, potentially resulting in partially moved data. + Therefore, for operations that move more than 10k files, we strongly + discourage using the DBFS REST API. + + Parameters + ---------- + source_path: str + From where to move (absolute path) + destination_path: str + To where to move (absolute path) + recursive: bool + Not implemented to far. + maxdepth: + Not implemented to far. + """ + if recursive: + raise NotImplementedError + if maxdepth: + raise NotImplementedError + + try: + self._send_to_api( + method="post", + endpoint="move", + json={"source_path": source_path, "destination_path": destination_path}, + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) + elif e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) + + raise e + self.invalidate_cache(self._parent(source_path)) + self.invalidate_cache(self._parent(destination_path)) + + def _open(self, path, mode="rb", block_size="default", **kwargs): + """ + Overwrite the base class method to make sure to create a DBFile. + All arguments are copied from the base method. + + Only the default blocksize is allowed. + """ + return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs) + + def _send_to_api(self, method, endpoint, json): + """ + Send the given json to the DBFS API + using a get or post request (specified by the argument `method`). + + Parameters + ---------- + method: str + Which http method to use for communication; "get" or "post". + endpoint: str + Where to send the request to (last part of the API URL) + json: dict + Dictionary of information to send + """ + if method == "post": + session_call = self.session.post + elif method == "get": + session_call = self.session.get + else: + raise ValueError(f"Do not understand method {method}") + + url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint) + + r = session_call(url, json=json) + + # The DBFS API will return a json, also in case of an exception. + # We want to preserve this information as good as possible. + try: + r.raise_for_status() + except requests.HTTPError as e: + # try to extract json error message + # if that fails, fall back to the original exception + try: + exception_json = e.response.json() + except Exception: + raise e + + raise DatabricksException(**exception_json) + + return r.json() + + def _create_handle(self, path, overwrite=True): + """ + Internal function to create a handle, which can be used to + write blocks of a file to DBFS. + A handle has a unique identifier which needs to be passed + whenever written during this transaction. + The handle is active for 10 minutes - after that a new + write transaction needs to be created. + Make sure to close the handle after you are finished. + + Parameters + ---------- + path: str + Absolute path for this file. + overwrite: bool + If a file already exist at this location, either overwrite + it or raise an exception. + """ + try: + r = self._send_to_api( + method="post", + endpoint="create", + json={"path": path, "overwrite": overwrite}, + ) + return r["handle"] + except DatabricksException as e: + if e.error_code == "RESOURCE_ALREADY_EXISTS": + raise FileExistsError(e.message) + + raise e + + def _close_handle(self, handle): + """ + Close a handle, which was opened by :func:`_create_handle`. + + Parameters + ---------- + handle: str + Which handle to close. + """ + try: + self._send_to_api(method="post", endpoint="close", json={"handle": handle}) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) + + raise e + + def _add_data(self, handle, data): + """ + Upload data to an already opened file handle + (opened by :func:`_create_handle`). + The maximal allowed data size is 1MB after + conversion to base64. + Remember to close the handle when you are finished. + + Parameters + ---------- + handle: str + Which handle to upload data to. + data: bytes + Block of data to add to the handle. + """ + data = base64.b64encode(data).decode() + try: + self._send_to_api( + method="post", + endpoint="add-block", + json={"handle": handle, "data": data}, + ) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) + elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED": + raise ValueError(e.message) + + raise e + + def _get_data(self, path, start, end): + """ + Download data in bytes from a given absolute path in a block + from [start, start+length]. + The maximum number of allowed bytes to read is 1MB. + + Parameters + ---------- + path: str + Absolute path to download data from + start: int + Start position of the block + end: int + End position of the block + """ + try: + r = self._send_to_api( + method="get", + endpoint="read", + json={"path": path, "offset": start, "length": end - start}, + ) + return base64.b64decode(r["data"]) + except DatabricksException as e: + if e.error_code == "RESOURCE_DOES_NOT_EXIST": + raise FileNotFoundError(e.message) + elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]: + raise ValueError(e.message) + + raise e + + def invalidate_cache(self, path=None): + if path is None: + self.dircache.clear() + else: + self.dircache.pop(path, None) + super().invalidate_cache(path) + + +class DatabricksFile(AbstractBufferedFile): + """ + Helper class for files referenced in the DatabricksFileSystem. + """ + + DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + **kwargs, + ): + """ + Create a new instance of the DatabricksFile. + + The blocksize needs to be the default one. + """ + if block_size is None or block_size == "default": + block_size = self.DEFAULT_BLOCK_SIZE + + assert ( + block_size == self.DEFAULT_BLOCK_SIZE + ), f"Only the default block size is allowed, not {block_size}" + + super().__init__( + fs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_type=cache_type, + cache_options=cache_options or {}, + **kwargs, + ) + + def _initiate_upload(self): + """Internal function to start a file upload""" + self.handle = self.fs._create_handle(self.path) + + def _upload_chunk(self, final=False): + """Internal function to add a chunk of data to a started upload""" + self.buffer.seek(0) + data = self.buffer.getvalue() + + data_chunks = [ + data[start:end] for start, end in self._to_sized_blocks(len(data)) + ] + + for data_chunk in data_chunks: + self.fs._add_data(handle=self.handle, data=data_chunk) + + if final: + self.fs._close_handle(handle=self.handle) + return True + + def _fetch_range(self, start, end): + """Internal function to download a block of data""" + return_buffer = b"" + length = end - start + for chunk_start, chunk_end in self._to_sized_blocks(length, start): + return_buffer += self.fs._get_data( + path=self.path, start=chunk_start, end=chunk_end + ) + + return return_buffer + + def _to_sized_blocks(self, length, start=0): + """Helper function to split a range from 0 to total_length into bloksizes""" + end = start + length + for data_chunk in range(start, end, self.blocksize): + data_start = data_chunk + data_end = min(end, data_chunk + self.blocksize) + yield data_start, data_end diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py new file mode 100644 index 0000000000000000000000000000000000000000..1b311ab2a1b53bd59fd785d3e4615aab6474c073 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py @@ -0,0 +1,364 @@ +from .. import filesystem +from ..asyn import AsyncFileSystem + + +class DirFileSystem(AsyncFileSystem): + """Directory prefix filesystem + + The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with + is relative to the `path`. After performing the necessary paths operation it + delegates everything to the wrapped filesystem. + """ + + protocol = "dir" + + def __init__( + self, + path=None, + fs=None, + fo=None, + target_protocol=None, + target_options=None, + **storage_options, + ): + """ + Parameters + ---------- + path: str + Path to the directory. + fs: AbstractFileSystem + An instantiated filesystem to wrap. + target_protocol, target_options: + if fs is none, construct it from these + fo: str + Alternate for path; do not provide both + """ + super().__init__(**storage_options) + if fs is None: + fs = filesystem(protocol=target_protocol, **(target_options or {})) + if (path is not None) ^ (fo is not None) is False: + raise ValueError("Provide path or fo, not both") + path = path or fo + + if self.asynchronous and not fs.async_impl: + raise ValueError("can't use asynchronous with non-async fs") + + if fs.async_impl and self.asynchronous != fs.asynchronous: + raise ValueError("both dirfs and fs should be in the same sync/async mode") + + self.path = fs._strip_protocol(path) + self.fs = fs + + def _join(self, path): + if isinstance(path, str): + if not self.path: + return path + if not path: + return self.path + return self.fs.sep.join((self.path, self._strip_protocol(path))) + return [self._join(_path) for _path in path] + + def _relpath(self, path): + if isinstance(path, str): + if not self.path: + return path + if path == self.path: + return "" + prefix = self.path + self.fs.sep + assert path.startswith(prefix) + return path[len(prefix) :] + return [self._relpath(_path) for _path in path] + + # Wrappers below + + @property + def sep(self): + return self.fs.sep + + async def set_session(self, *args, **kwargs): + return await self.fs.set_session(*args, **kwargs) + + async def _rm_file(self, path, **kwargs): + return await self.fs._rm_file(self._join(path), **kwargs) + + def rm_file(self, path, **kwargs): + return self.fs.rm_file(self._join(path), **kwargs) + + async def _rm(self, path, *args, **kwargs): + return await self.fs._rm(self._join(path), *args, **kwargs) + + def rm(self, path, *args, **kwargs): + return self.fs.rm(self._join(path), *args, **kwargs) + + async def _cp_file(self, path1, path2, **kwargs): + return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs) + + def cp_file(self, path1, path2, **kwargs): + return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs) + + async def _copy( + self, + path1, + path2, + *args, + **kwargs, + ): + return await self.fs._copy( + self._join(path1), + self._join(path2), + *args, + **kwargs, + ) + + def copy(self, path1, path2, *args, **kwargs): + return self.fs.copy( + self._join(path1), + self._join(path2), + *args, + **kwargs, + ) + + async def _pipe(self, path, *args, **kwargs): + return await self.fs._pipe(self._join(path), *args, **kwargs) + + def pipe(self, path, *args, **kwargs): + return self.fs.pipe(self._join(path), *args, **kwargs) + + async def _pipe_file(self, path, *args, **kwargs): + return await self.fs._pipe_file(self._join(path), *args, **kwargs) + + def pipe_file(self, path, *args, **kwargs): + return self.fs.pipe_file(self._join(path), *args, **kwargs) + + async def _cat_file(self, path, *args, **kwargs): + return await self.fs._cat_file(self._join(path), *args, **kwargs) + + def cat_file(self, path, *args, **kwargs): + return self.fs.cat_file(self._join(path), *args, **kwargs) + + async def _cat(self, path, *args, **kwargs): + ret = await self.fs._cat( + self._join(path), + *args, + **kwargs, + ) + + if isinstance(ret, dict): + return {self._relpath(key): value for key, value in ret.items()} + + return ret + + def cat(self, path, *args, **kwargs): + ret = self.fs.cat( + self._join(path), + *args, + **kwargs, + ) + + if isinstance(ret, dict): + return {self._relpath(key): value for key, value in ret.items()} + + return ret + + async def _put_file(self, lpath, rpath, **kwargs): + return await self.fs._put_file(lpath, self._join(rpath), **kwargs) + + def put_file(self, lpath, rpath, **kwargs): + return self.fs.put_file(lpath, self._join(rpath), **kwargs) + + async def _put( + self, + lpath, + rpath, + *args, + **kwargs, + ): + return await self.fs._put( + lpath, + self._join(rpath), + *args, + **kwargs, + ) + + def put(self, lpath, rpath, *args, **kwargs): + return self.fs.put( + lpath, + self._join(rpath), + *args, + **kwargs, + ) + + async def _get_file(self, rpath, lpath, **kwargs): + return await self.fs._get_file(self._join(rpath), lpath, **kwargs) + + def get_file(self, rpath, lpath, **kwargs): + return self.fs.get_file(self._join(rpath), lpath, **kwargs) + + async def _get(self, rpath, *args, **kwargs): + return await self.fs._get(self._join(rpath), *args, **kwargs) + + def get(self, rpath, *args, **kwargs): + return self.fs.get(self._join(rpath), *args, **kwargs) + + async def _isfile(self, path): + return await self.fs._isfile(self._join(path)) + + def isfile(self, path): + return self.fs.isfile(self._join(path)) + + async def _isdir(self, path): + return await self.fs._isdir(self._join(path)) + + def isdir(self, path): + return self.fs.isdir(self._join(path)) + + async def _size(self, path): + return await self.fs._size(self._join(path)) + + def size(self, path): + return self.fs.size(self._join(path)) + + async def _exists(self, path): + return await self.fs._exists(self._join(path)) + + def exists(self, path): + return self.fs.exists(self._join(path)) + + async def _info(self, path, **kwargs): + return await self.fs._info(self._join(path), **kwargs) + + def info(self, path, **kwargs): + return self.fs.info(self._join(path), **kwargs) + + async def _ls(self, path, detail=True, **kwargs): + ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy() + if detail: + out = [] + for entry in ret: + entry = entry.copy() + entry["name"] = self._relpath(entry["name"]) + out.append(entry) + return out + + return self._relpath(ret) + + def ls(self, path, detail=True, **kwargs): + ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy() + if detail: + out = [] + for entry in ret: + entry = entry.copy() + entry["name"] = self._relpath(entry["name"]) + out.append(entry) + return out + + return self._relpath(ret) + + async def _walk(self, path, *args, **kwargs): + async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs): + yield self._relpath(root), dirs, files + + def walk(self, path, *args, **kwargs): + for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs): + yield self._relpath(root), dirs, files + + async def _glob(self, path, **kwargs): + detail = kwargs.get("detail", False) + ret = await self.fs._glob(self._join(path), **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + def glob(self, path, **kwargs): + detail = kwargs.get("detail", False) + ret = self.fs.glob(self._join(path), **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + async def _du(self, path, *args, **kwargs): + total = kwargs.get("total", True) + ret = await self.fs._du(self._join(path), *args, **kwargs) + if total: + return ret + + return {self._relpath(path): size for path, size in ret.items()} + + def du(self, path, *args, **kwargs): + total = kwargs.get("total", True) + ret = self.fs.du(self._join(path), *args, **kwargs) + if total: + return ret + + return {self._relpath(path): size for path, size in ret.items()} + + async def _find(self, path, *args, **kwargs): + detail = kwargs.get("detail", False) + ret = await self.fs._find(self._join(path), *args, **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + def find(self, path, *args, **kwargs): + detail = kwargs.get("detail", False) + ret = self.fs.find(self._join(path), *args, **kwargs) + if detail: + return {self._relpath(path): info for path, info in ret.items()} + return self._relpath(ret) + + async def _expand_path(self, path, *args, **kwargs): + return self._relpath( + await self.fs._expand_path(self._join(path), *args, **kwargs) + ) + + def expand_path(self, path, *args, **kwargs): + return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs)) + + async def _mkdir(self, path, *args, **kwargs): + return await self.fs._mkdir(self._join(path), *args, **kwargs) + + def mkdir(self, path, *args, **kwargs): + return self.fs.mkdir(self._join(path), *args, **kwargs) + + async def _makedirs(self, path, *args, **kwargs): + return await self.fs._makedirs(self._join(path), *args, **kwargs) + + def makedirs(self, path, *args, **kwargs): + return self.fs.makedirs(self._join(path), *args, **kwargs) + + def rmdir(self, path): + return self.fs.rmdir(self._join(path)) + + def mv(self, path1, path2, **kwargs): + return self.fs.mv( + self._join(path1), + self._join(path2), + **kwargs, + ) + + def touch(self, path, **kwargs): + return self.fs.touch(self._join(path), **kwargs) + + def created(self, path): + return self.fs.created(self._join(path)) + + def modified(self, path): + return self.fs.modified(self._join(path)) + + def sign(self, path, *args, **kwargs): + return self.fs.sign(self._join(path), *args, **kwargs) + + def __repr__(self): + return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})" + + def open( + self, + path, + *args, + **kwargs, + ): + return self.fs.open( + self._join(path), + *args, + **kwargs, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py new file mode 100644 index 0000000000000000000000000000000000000000..415f4844952f362188561a3e41425d364a115400 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py @@ -0,0 +1,385 @@ +import os +import sys +import uuid +import warnings +from ftplib import FTP, Error, error_perm +from typing import Any + +from ..spec import AbstractBufferedFile, AbstractFileSystem +from ..utils import infer_storage_options, isfilelike + + +class FTPFileSystem(AbstractFileSystem): + """A filesystem over classic FTP""" + + root_marker = "/" + cachable = False + protocol = "ftp" + + def __init__( + self, + host, + port=21, + username=None, + password=None, + acct=None, + block_size=None, + tempdir=None, + timeout=30, + encoding="utf-8", + **kwargs, + ): + """ + You can use _get_kwargs_from_urls to get some kwargs from + a reasonable FTP url. + + Authentication will be anonymous if username/password are not + given. + + Parameters + ---------- + host: str + The remote server name/ip to connect to + port: int + Port to connect with + username: str or None + If authenticating, the user's identifier + password: str of None + User's password on the server, if using + acct: str or None + Some servers also need an "account" string for auth + block_size: int or None + If given, the read-ahead or write buffer size. + tempdir: str + Directory on remote to put temporary files when in a transaction + timeout: int + Timeout of the ftp connection in seconds + encoding: str + Encoding to use for directories and filenames in FTP connection + """ + super().__init__(**kwargs) + self.host = host + self.port = port + self.tempdir = tempdir or "/tmp" + self.cred = username, password, acct + self.timeout = timeout + self.encoding = encoding + if block_size is not None: + self.blocksize = block_size + else: + self.blocksize = 2**16 + self._connect() + + def _connect(self): + if sys.version_info >= (3, 9): + self.ftp = FTP(timeout=self.timeout, encoding=self.encoding) + elif self.encoding: + warnings.warn("`encoding` not supported for python<3.9, ignoring") + self.ftp = FTP(timeout=self.timeout) + else: + self.ftp = FTP(timeout=self.timeout) + self.ftp.connect(self.host, self.port) + self.ftp.login(*self.cred) + + @classmethod + def _strip_protocol(cls, path): + return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + out = [] + if path not in self.dircache: + try: + try: + out = [ + (fn, details) + for (fn, details) in self.ftp.mlsd(path) + if fn not in [".", ".."] + and details["type"] not in ["pdir", "cdir"] + ] + except error_perm: + out = _mlsd2(self.ftp, path) # Not platform independent + for fn, details in out: + if path == "/": + path = "" # just for forming the names, below + details["name"] = "/".join([path, fn.lstrip("/")]) + if details["type"] == "file": + details["size"] = int(details["size"]) + else: + details["size"] = 0 + if details["type"] == "dir": + details["type"] = "directory" + self.dircache[path] = out + except Error: + try: + info = self.info(path) + if info["type"] == "file": + out = [(path, info)] + except (Error, IndexError): + raise FileNotFoundError(path) + files = self.dircache.get(path, out) + if not detail: + return sorted([fn for fn, details in files]) + return [details for fn, details in files] + + def info(self, path, **kwargs): + # implement with direct method + path = self._strip_protocol(path) + if path == "/": + # special case, since this dir has no real entry + return {"name": "/", "size": 0, "type": "directory"} + files = self.ls(self._parent(path).lstrip("/"), True) + try: + out = [f for f in files if f["name"] == path][0] + except IndexError: + raise FileNotFoundError(path) + return out + + def get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + if not os.path.exists(lpath): + os.mkdir(lpath) + return + if isfilelike(lpath): + outfile = lpath + else: + outfile = open(lpath, "wb") + + def cb(x): + outfile.write(x) + + self.ftp.retrbinary( + f"RETR {rpath}", + blocksize=self.blocksize, + callback=cb, + ) + if not isfilelike(lpath): + outfile.close() + + def cat_file(self, path, start=None, end=None, **kwargs): + if end is not None: + return super().cat_file(path, start, end, **kwargs) + out = [] + + def cb(x): + out.append(x) + + try: + self.ftp.retrbinary( + f"RETR {path}", + blocksize=self.blocksize, + rest=start, + callback=cb, + ) + except (Error, error_perm) as orig_exc: + raise FileNotFoundError(path) from orig_exc + return b"".join(out) + + def _open( + self, + path, + mode="rb", + block_size=None, + cache_options=None, + autocommit=True, + **kwargs, + ): + path = self._strip_protocol(path) + block_size = block_size or self.blocksize + return FTPFile( + self, + path, + mode=mode, + block_size=block_size, + tempdir=self.tempdir, + autocommit=autocommit, + cache_options=cache_options, + ) + + def _rm(self, path): + path = self._strip_protocol(path) + self.ftp.delete(path) + self.invalidate_cache(self._parent(path)) + + def rm(self, path, recursive=False, maxdepth=None): + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(paths): + if self.isfile(p): + self.rm_file(p) + else: + self.rmdir(p) + + def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None: + path = self._strip_protocol(path) + parent = self._parent(path) + if parent != self.root_marker and not self.exists(parent) and create_parents: + self.mkdir(parent, create_parents=create_parents) + + self.ftp.mkd(path) + self.invalidate_cache(self._parent(path)) + + def makedirs(self, path: str, exist_ok: bool = False) -> None: + path = self._strip_protocol(path) + if self.exists(path): + # NB: "/" does not "exist" as it has no directory entry + if not exist_ok: + raise FileExistsError(f"{path} exists without `exist_ok`") + # exists_ok=True -> no-op + else: + self.mkdir(path, create_parents=True) + + def rmdir(self, path): + path = self._strip_protocol(path) + self.ftp.rmd(path) + self.invalidate_cache(self._parent(path)) + + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + self.ftp.rename(path1, path2) + self.invalidate_cache(self._parent(path1)) + self.invalidate_cache(self._parent(path2)) + + def __del__(self): + self.ftp.close() + + def invalidate_cache(self, path=None): + if path is None: + self.dircache.clear() + else: + self.dircache.pop(path, None) + super().invalidate_cache(path) + + +class TransferDone(Exception): + """Internal exception to break out of transfer""" + + pass + + +class FTPFile(AbstractBufferedFile): + """Interact with a remote FTP file with read/write buffering""" + + def __init__( + self, + fs, + path, + mode="rb", + block_size="default", + autocommit=True, + cache_type="readahead", + cache_options=None, + **kwargs, + ): + super().__init__( + fs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_type=cache_type, + cache_options=cache_options, + **kwargs, + ) + if not autocommit: + self.target = self.path + self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())]) + + def commit(self): + self.fs.mv(self.path, self.target) + + def discard(self): + self.fs.rm(self.path) + + def _fetch_range(self, start, end): + """Get bytes between given byte limits + + Implemented by raising an exception in the fetch callback when the + number of bytes received reaches the requested amount. + + Will fail if the server does not respect the REST command on + retrieve requests. + """ + out = [] + total = [0] + + def callback(x): + total[0] += len(x) + if total[0] > end - start: + out.append(x[: (end - start) - total[0]]) + if end < self.size: + raise TransferDone + else: + out.append(x) + + if total[0] == end - start and end < self.size: + raise TransferDone + + try: + self.fs.ftp.retrbinary( + f"RETR {self.path}", + blocksize=self.blocksize, + rest=start, + callback=callback, + ) + except TransferDone: + try: + # stop transfer, we got enough bytes for this block + self.fs.ftp.abort() + self.fs.ftp.getmultiline() + except Error: + self.fs._connect() + + return b"".join(out) + + def _upload_chunk(self, final=False): + self.buffer.seek(0) + self.fs.ftp.storbinary( + f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset + ) + return True + + +def _mlsd2(ftp, path="."): + """ + Fall back to using `dir` instead of `mlsd` if not supported. + + This parses a Linux style `ls -l` response to `dir`, but the response may + be platform dependent. + + Parameters + ---------- + ftp: ftplib.FTP + path: str + Expects to be given path, but defaults to ".". + """ + lines = [] + minfo = [] + ftp.dir(path, lines.append) + for line in lines: + split_line = line.split() + if len(split_line) < 9: + continue + this = ( + split_line[-1], + { + "modify": " ".join(split_line[5:8]), + "unix.owner": split_line[2], + "unix.group": split_line[3], + "unix.mode": split_line[0], + "size": split_line[4], + }, + ) + if "d" == this[1]["unix.mode"][0]: + this[1]["type"] = "dir" + else: + this[1]["type"] = "file" + minfo.append(this) + return minfo diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py new file mode 100644 index 0000000000000000000000000000000000000000..7c34d93e08c20fc65421e5aa4bab53e8c683fee7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py @@ -0,0 +1,127 @@ +import os + +import pygit2 + +from fsspec.spec import AbstractFileSystem + +from .memory import MemoryFile + + +class GitFileSystem(AbstractFileSystem): + """Browse the files of a local git repo at any hash/tag/branch + + (experimental backend) + """ + + root_marker = "" + cachable = True + + def __init__(self, path=None, fo=None, ref=None, **kwargs): + """ + + Parameters + ---------- + path: str (optional) + Local location of the repo (uses current directory if not given). + May be deprecated in favour of ``fo``. When used with a higher + level function such as fsspec.open(), may be of the form + "git://[path-to-repo[:]][ref@]path/to/file" (but the actual + file path should not contain "@" or ":"). + fo: str (optional) + Same as ``path``, but passed as part of a chained URL. This one + takes precedence if both are given. + ref: str (optional) + Reference to work with, could be a hash, tag or branch name. Defaults + to current working tree. Note that ``ls`` and ``open`` also take hash, + so this becomes the default for those operations + kwargs + """ + super().__init__(**kwargs) + self.repo = pygit2.Repository(fo or path or os.getcwd()) + self.ref = ref or "master" + + @classmethod + def _strip_protocol(cls, path): + path = super()._strip_protocol(path).lstrip("/") + if ":" in path: + path = path.split(":", 1)[1] + if "@" in path: + path = path.split("@", 1)[1] + return path.lstrip("/") + + def _path_to_object(self, path, ref): + comm, ref = self.repo.resolve_refish(ref or self.ref) + parts = path.split("/") + tree = comm.tree + for part in parts: + if part and isinstance(tree, pygit2.Tree): + tree = tree[part] + return tree + + @staticmethod + def _get_kwargs_from_urls(path): + if path.startswith("git://"): + path = path[6:] + out = {} + if ":" in path: + out["path"], path = path.split(":", 1) + if "@" in path: + out["ref"], path = path.split("@", 1) + return out + + def ls(self, path, detail=True, ref=None, **kwargs): + path = self._strip_protocol(path) + tree = self._path_to_object(path, ref) + if isinstance(tree, pygit2.Tree): + out = [] + for obj in tree: + if isinstance(obj, pygit2.Tree): + out.append( + { + "type": "directory", + "name": "/".join([path, obj.name]).lstrip("/"), + "hex": obj.hex, + "mode": f"{obj.filemode:o}", + "size": 0, + } + ) + else: + out.append( + { + "type": "file", + "name": "/".join([path, obj.name]).lstrip("/"), + "hex": obj.hex, + "mode": f"{obj.filemode:o}", + "size": obj.size, + } + ) + else: + obj = tree + out = [ + { + "type": "file", + "name": obj.name, + "hex": obj.hex, + "mode": f"{obj.filemode:o}", + "size": obj.size, + } + ] + if detail: + return out + return [o["name"] for o in out] + + def ukey(self, path, ref=None): + return self.info(path, ref=ref)["hex"] + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + ref=None, + **kwargs, + ): + obj = self._path_to_object(path, ref or self.ref) + return MemoryFile(data=obj.data) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py new file mode 100644 index 0000000000000000000000000000000000000000..e8fe7a2f67d7d9648e75b82d5f0b5cf2bf7a4868 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py @@ -0,0 +1,227 @@ +import requests + +from ..spec import AbstractFileSystem +from ..utils import infer_storage_options +from .memory import MemoryFile + +# TODO: add GIST backend, would be very similar + + +class GithubFileSystem(AbstractFileSystem): + """Interface to files in github + + An instance of this class provides the files residing within a remote github + repository. You may specify a point in the repos history, by SHA, branch + or tag (default is current master). + + Given that code files tend to be small, and that github does not support + retrieving partial content, we always fetch whole files. + + When using fsspec.open, allows URIs of the form: + + - "github://path/file", in which case you must specify org, repo and + may specify sha in the extra args + - 'github://org:repo@/precip/catalog.yml', where the org and repo are + part of the URI + - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included + + ``sha`` can be the full or abbreviated hex of the commit you want to fetch + from, or a branch or tag name (so long as it doesn't contain special characters + like "/", "?", which would have to be HTTP-encoded). + + For authorised access, you must provide username and token, which can be made + at https://github.com/settings/tokens + """ + + url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" + rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" + protocol = "github" + timeout = (60, 60) # connect, read timeouts + + def __init__( + self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs + ): + super().__init__(**kwargs) + self.org = org + self.repo = repo + if (username is None) ^ (token is None): + raise ValueError("Auth required both username and token") + self.username = username + self.token = token + if timeout is not None: + self.timeout = timeout + if sha is None: + # look up default branch (not necessarily "master") + u = "https://api.github.com/repos/{org}/{repo}" + r = requests.get( + u.format(org=org, repo=repo), timeout=self.timeout, **self.kw + ) + r.raise_for_status() + sha = r.json()["default_branch"] + + self.root = sha + self.ls("") + + @property + def kw(self): + if self.username: + return {"auth": (self.username, self.token)} + return {} + + @classmethod + def repos(cls, org_or_user, is_org=True): + """List repo names for given org or user + + This may become the top level of the FS + + Parameters + ---------- + org_or_user: str + Name of the github org or user to query + is_org: bool (default True) + Whether the name is an organisation (True) or user (False) + + Returns + ------- + List of string + """ + r = requests.get( + f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos", + timeout=cls.timeout, + ) + r.raise_for_status() + return [repo["name"] for repo in r.json()] + + @property + def tags(self): + """Names of tags in the repo""" + r = requests.get( + f"https://api.github.com/repos/{self.org}/{self.repo}/tags", + timeout=self.timeout, + **self.kw, + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def branches(self): + """Names of branches in the repo""" + r = requests.get( + f"https://api.github.com/repos/{self.org}/{self.repo}/branches", + timeout=self.timeout, + **self.kw, + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def refs(self): + """Named references, tags and branches""" + return {"tags": self.tags, "branches": self.branches} + + def ls(self, path, detail=False, sha=None, _sha=None, **kwargs): + """List files at given path + + Parameters + ---------- + path: str + Location to list, relative to repo root + detail: bool + If True, returns list of dicts, one per file; if False, returns + list of full filenames only + sha: str (optional) + List at the given point in the repo history, branch or tag name or commit + SHA + _sha: str (optional) + List this specific tree object (used internally to descend into trees) + """ + path = self._strip_protocol(path) + if path == "": + _sha = sha or self.root + if _sha is None: + parts = path.rstrip("/").split("/") + so_far = "" + _sha = sha or self.root + for part in parts: + out = self.ls(so_far, True, sha=sha, _sha=_sha) + so_far += "/" + part if so_far else part + out = [o for o in out if o["name"] == so_far] + if not out: + raise FileNotFoundError(path) + out = out[0] + if out["type"] == "file": + if detail: + return [out] + else: + return path + _sha = out["sha"] + if path not in self.dircache or sha not in [self.root, None]: + r = requests.get( + self.url.format(org=self.org, repo=self.repo, sha=_sha), + timeout=self.timeout, + **self.kw, + ) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + types = {"blob": "file", "tree": "directory"} + out = [ + { + "name": path + "/" + f["path"] if path else f["path"], + "mode": f["mode"], + "type": types[f["type"]], + "size": f.get("size", 0), + "sha": f["sha"], + } + for f in r.json()["tree"] + if f["type"] in types + ] + if sha in [self.root, None]: + self.dircache[path] = out + else: + out = self.dircache[path] + if detail: + return out + else: + return sorted([f["name"] for f in out]) + + def invalidate_cache(self, path=None): + self.dircache.clear() + + @classmethod + def _strip_protocol(cls, path): + opts = infer_storage_options(path) + if "username" not in opts: + return super()._strip_protocol(path) + return opts["path"].lstrip("/") + + @staticmethod + def _get_kwargs_from_urls(path): + opts = infer_storage_options(path) + if "username" not in opts: + return {} + out = {"org": opts["username"], "repo": opts["password"]} + if opts["host"]: + out["sha"] = opts["host"] + return out + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + sha=None, + **kwargs, + ): + if mode != "rb": + raise NotImplementedError + url = self.rurl.format( + org=self.org, repo=self.repo, path=path, sha=sha or self.root + ) + r = requests.get(url, timeout=self.timeout, **self.kw) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + return MemoryFile(None, None, r.content) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py new file mode 100644 index 0000000000000000000000000000000000000000..4580764ce8495294bfa0ca44fcb60a4c2377f6b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py @@ -0,0 +1,871 @@ +import asyncio +import io +import logging +import re +import weakref +from copy import copy +from urllib.parse import urlparse + +import aiohttp +import yarl + +from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper +from fsspec.callbacks import DEFAULT_CALLBACK +from fsspec.exceptions import FSTimeoutError +from fsspec.spec import AbstractBufferedFile +from fsspec.utils import ( + DEFAULT_BLOCK_SIZE, + glob_translate, + isfilelike, + nullcontext, + tokenize, +) + +from ..caching import AllBytes + +# https://stackoverflow.com/a/15926317/3821154 +ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P[^"']+)""") +ex2 = re.compile(r"""(?Phttp[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") +logger = logging.getLogger("fsspec.http") + + +async def get_client(**kwargs): + return aiohttp.ClientSession(**kwargs) + + +class HTTPFileSystem(AsyncFileSystem): + """ + Simple File-System for fetching data via HTTP(S) + + ``ls()`` is implemented by loading the parent page and doing a regex + match on the result. If simple_link=True, anything of the form + "http(s)://server.com/stuff?thing=other"; otherwise only links within + HTML href tags will be used. + """ + + sep = "/" + + def __init__( + self, + simple_links=True, + block_size=None, + same_scheme=True, + size_policy=None, + cache_type="bytes", + cache_options=None, + asynchronous=False, + loop=None, + client_kwargs=None, + get_client=get_client, + encoded=False, + **storage_options, + ): + """ + NB: if this is called async, you must await set_client + + Parameters + ---------- + block_size: int + Blocks to read bytes; if 0, will default to raw requests file-like + objects instead of HTTPFile instances + simple_links: bool + If True, will consider both HTML tags and anything that looks + like a URL; if False, will consider only the former. + same_scheme: True + When doing ls/glob, if this is True, only consider paths that have + http/https matching the input URLs. + size_policy: this argument is deprecated + client_kwargs: dict + Passed to aiohttp.ClientSession, see + https://docs.aiohttp.org/en/stable/client_reference.html + For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}`` + get_client: Callable[..., aiohttp.ClientSession] + A callable which takes keyword arguments and constructs + an aiohttp.ClientSession. It's state will be managed by + the HTTPFileSystem class. + storage_options: key-value + Any other parameters passed on to requests + cache_type, cache_options: defaults used in open + """ + super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options) + self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE + self.simple_links = simple_links + self.same_schema = same_scheme + self.cache_type = cache_type + self.cache_options = cache_options + self.client_kwargs = client_kwargs or {} + self.get_client = get_client + self.encoded = encoded + self.kwargs = storage_options + self._session = None + + # Clean caching-related parameters from `storage_options` + # before propagating them as `request_options` through `self.kwargs`. + # TODO: Maybe rename `self.kwargs` to `self.request_options` to make + # it clearer. + request_options = copy(storage_options) + self.use_listings_cache = request_options.pop("use_listings_cache", False) + request_options.pop("listings_expiry_time", None) + request_options.pop("max_paths", None) + request_options.pop("skip_instance_cache", None) + self.kwargs = request_options + + @property + def fsid(self): + return "http" + + def encode_url(self, url): + return yarl.URL(url, encoded=self.encoded) + + @staticmethod + def close_session(loop, session): + if loop is not None and loop.is_running(): + try: + sync(loop, session.close, timeout=0.1) + return + except (TimeoutError, FSTimeoutError, NotImplementedError): + pass + connector = getattr(session, "_connector", None) + if connector is not None: + # close after loop is dead + connector._close() + + async def set_session(self): + if self._session is None: + self._session = await self.get_client(loop=self.loop, **self.client_kwargs) + if not self.asynchronous: + weakref.finalize(self, self.close_session, self.loop, self._session) + return self._session + + @classmethod + def _strip_protocol(cls, path): + """For HTTP, we always want to keep the full URL""" + return path + + @classmethod + def _parent(cls, path): + # override, since _strip_protocol is different for URLs + par = super()._parent(path) + if len(par) > 7: # "http://..." + return par + return "" + + async def _ls_real(self, url, detail=True, **kwargs): + # ignoring URL-encoded arguments + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + session = await self.set_session() + async with session.get(self.encode_url(url), **self.kwargs) as r: + self._raise_not_found_for_status(r, url) + try: + text = await r.text() + if self.simple_links: + links = ex2.findall(text) + [u[2] for u in ex.findall(text)] + else: + links = [u[2] for u in ex.findall(text)] + except UnicodeDecodeError: + links = [] # binary, not HTML + out = set() + parts = urlparse(url) + for l in links: + if isinstance(l, tuple): + l = l[1] + if l.startswith("/") and len(l) > 1: + # absolute URL on this server + l = f"{parts.scheme}://{parts.netloc}{l}" + if l.startswith("http"): + if self.same_schema and l.startswith(url.rstrip("/") + "/"): + out.add(l) + elif l.replace("https", "http").startswith( + url.replace("https", "http").rstrip("/") + "/" + ): + # allowed to cross http <-> https + out.add(l) + else: + if l not in ["..", "../"]: + # Ignore FTP-like "parent" + out.add("/".join([url.rstrip("/"), l.lstrip("/")])) + if not out and url.endswith("/"): + out = await self._ls_real(url.rstrip("/"), detail=False) + if detail: + return [ + { + "name": u, + "size": None, + "type": "directory" if u.endswith("/") else "file", + } + for u in out + ] + else: + return sorted(out) + + async def _ls(self, url, detail=True, **kwargs): + if self.use_listings_cache and url in self.dircache: + out = self.dircache[url] + else: + out = await self._ls_real(url, detail=detail, **kwargs) + self.dircache[url] = out + return out + + ls = sync_wrapper(_ls) + + def _raise_not_found_for_status(self, response, url): + """ + Raises FileNotFoundError for 404s, otherwise uses raise_for_status. + """ + if response.status == 404: + raise FileNotFoundError(url) + response.raise_for_status() + + async def _cat_file(self, url, start=None, end=None, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + + if start is not None or end is not None: + if start == end: + return b"" + headers = kw.pop("headers", {}).copy() + + headers["Range"] = await self._process_limits(url, start, end) + kw["headers"] = headers + session = await self.set_session() + async with session.get(self.encode_url(url), **kw) as r: + out = await r.read() + self._raise_not_found_for_status(r, url) + return out + + async def _get_file( + self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs + ): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(rpath) + session = await self.set_session() + async with session.get(self.encode_url(rpath), **kw) as r: + try: + size = int(r.headers["content-length"]) + except (ValueError, KeyError): + size = None + + callback.set_size(size) + self._raise_not_found_for_status(r, rpath) + if isfilelike(lpath): + outfile = lpath + else: + outfile = open(lpath, "wb") # noqa: ASYNC101 + + try: + chunk = True + while chunk: + chunk = await r.content.read(chunk_size) + outfile.write(chunk) + callback.relative_update(len(chunk)) + finally: + if not isfilelike(lpath): + outfile.close() + + async def _put_file( + self, + lpath, + rpath, + chunk_size=5 * 2**20, + callback=DEFAULT_CALLBACK, + method="post", + **kwargs, + ): + async def gen_chunks(): + # Support passing arbitrary file-like objects + # and use them instead of streams. + if isinstance(lpath, io.IOBase): + context = nullcontext(lpath) + use_seek = False # might not support seeking + else: + context = open(lpath, "rb") # noqa: ASYNC101 + use_seek = True + + with context as f: + if use_seek: + callback.set_size(f.seek(0, 2)) + f.seek(0) + else: + callback.set_size(getattr(f, "size", None)) + + chunk = f.read(chunk_size) + while chunk: + yield chunk + callback.relative_update(len(chunk)) + chunk = f.read(chunk_size) + + kw = self.kwargs.copy() + kw.update(kwargs) + session = await self.set_session() + + method = method.lower() + if method not in ("post", "put"): + raise ValueError( + f"method has to be either 'post' or 'put', not: {method!r}" + ) + + meth = getattr(session, method) + async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp: + self._raise_not_found_for_status(resp, rpath) + + async def _exists(self, path, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + try: + logger.debug(path) + session = await self.set_session() + r = await session.get(self.encode_url(path), **kw) + async with r: + return r.status < 400 + except aiohttp.ClientError: + return False + + async def _isfile(self, path, **kwargs): + return await self._exists(path, **kwargs) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=None, # XXX: This differs from the base class. + cache_type=None, + cache_options=None, + size=None, + **kwargs, + ): + """Make a file-like object + + Parameters + ---------- + path: str + Full URL with protocol + mode: string + must be "rb" + block_size: int or None + Bytes to download in one request; use instance value if None. If + zero, will return a streaming Requests file-like instance. + kwargs: key-value + Any other parameters, passed to requests calls + """ + if mode != "rb": + raise NotImplementedError + block_size = block_size if block_size is not None else self.block_size + kw = self.kwargs.copy() + kw["asynchronous"] = self.asynchronous + kw.update(kwargs) + size = size or self.info(path, **kwargs)["size"] + session = sync(self.loop, self.set_session) + if block_size and size: + return HTTPFile( + self, + path, + session=session, + block_size=block_size, + mode=mode, + size=size, + cache_type=cache_type or self.cache_type, + cache_options=cache_options or self.cache_options, + loop=self.loop, + **kw, + ) + else: + return HTTPStreamFile( + self, + path, + mode=mode, + loop=self.loop, + session=session, + **kw, + ) + + async def open_async(self, path, mode="rb", size=None, **kwargs): + session = await self.set_session() + if size is None: + try: + size = (await self._info(path, **kwargs))["size"] + except FileNotFoundError: + pass + return AsyncStreamFile( + self, + path, + loop=self.loop, + session=session, + size=size, + **kwargs, + ) + + def ukey(self, url): + """Unique identifier; assume HTTP files are static, unchanging""" + return tokenize(url, self.kwargs, self.protocol) + + async def _info(self, url, **kwargs): + """Get info of URL + + Tries to access location via HEAD, and then GET methods, but does + not fetch the data. + + It is possible that the server does not supply any size information, in + which case size will be given as None (and certain operations on the + corresponding file will not work). + """ + info = {} + session = await self.set_session() + + for policy in ["head", "get"]: + try: + info.update( + await _file_info( + self.encode_url(url), + size_policy=policy, + session=session, + **self.kwargs, + **kwargs, + ) + ) + if info.get("size") is not None: + break + except Exception as exc: + if policy == "get": + # If get failed, then raise a FileNotFoundError + raise FileNotFoundError(url) from exc + logger.debug("", exc_info=exc) + + return {"name": url, "size": None, **info, "type": "file"} + + async def _glob(self, path, maxdepth=None, **kwargs): + """ + Find files by glob-matching. + + This implementation is idntical to the one in AbstractFileSystem, + but "?" is not considered as a character for globbing, because it is + so common in URLs, often identifying the "query" part. + """ + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + import re + + ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash + path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*")) + idx_star = path.find("*") if path.find("*") >= 0 else len(path) + idx_brace = path.find("[") if path.find("[") >= 0 else len(path) + + min_idx = min(idx_star, idx_brace) + + detail = kwargs.pop("detail", False) + + if not has_magic(path): + if await self._exists(path, **kwargs): + if not detail: + return [path] + else: + return {path: await self._info(path, **kwargs)} + else: + if not detail: + return [] # glob of non-existent returns empty + else: + return {} + elif "/" in path[:min_idx]: + min_idx = path[:min_idx].rindex("/") + root = path[: min_idx + 1] + depth = path[min_idx + 1 :].count("/") + 1 + else: + root = "" + depth = path[min_idx + 1 :].count("/") + 1 + + if "**" in path: + if maxdepth is not None: + idx_double_stars = path.find("**") + depth_double_stars = path[idx_double_stars:].count("/") + 1 + depth = depth - depth_double_stars + maxdepth + else: + depth = None + + allpaths = await self._find( + root, maxdepth=depth, withdirs=True, detail=True, **kwargs + ) + + pattern = glob_translate(path + ("/" if ends_with_slash else "")) + pattern = re.compile(pattern) + + out = { + ( + p.rstrip("/") + if not append_slash_to_dirname + and info["type"] == "directory" + and p.endswith("/") + else p + ): info + for p, info in sorted(allpaths.items()) + if pattern.match(p.rstrip("/")) + } + + if detail: + return out + else: + return list(out) + + async def _isdir(self, path): + # override, since all URLs are (also) files + try: + return bool(await self._ls(path)) + except (FileNotFoundError, ValueError): + return False + + +class HTTPFile(AbstractBufferedFile): + """ + A file-like object pointing to a remove HTTP(S) resource + + Supports only reading, with read-ahead of a predermined block-size. + + In the case that the server does not supply the filesize, only reading of + the complete file in one go is supported. + + Parameters + ---------- + url: str + Full URL of the remote resource, including the protocol + session: aiohttp.ClientSession or None + All calls will be made within this session, to avoid restarting + connections where the server allows this + block_size: int or None + The amount of read-ahead to do, in bytes. Default is 5MB, or the value + configured for the FileSystem creating this file + size: None or int + If given, this is the size of the file in bytes, and we don't attempt + to call the server to find the value. + kwargs: all other key-values are passed to requests calls. + """ + + def __init__( + self, + fs, + url, + session=None, + block_size=None, + mode="rb", + cache_type="bytes", + cache_options=None, + size=None, + loop=None, + asynchronous=False, + **kwargs, + ): + if mode != "rb": + raise NotImplementedError("File mode not supported") + self.asynchronous = asynchronous + self.url = url + self.session = session + self.details = {"name": url, "size": size, "type": "file"} + super().__init__( + fs=fs, + path=url, + mode=mode, + block_size=block_size, + cache_type=cache_type, + cache_options=cache_options, + **kwargs, + ) + self.loop = loop + + def read(self, length=-1): + """Read bytes from file + + Parameters + ---------- + length: int + Read up to this many bytes. If negative, read all content to end of + file. If the server has not supplied the filesize, attempting to + read only part of the data will raise a ValueError. + """ + if ( + (length < 0 and self.loc == 0) # explicit read all + # but not when the size is known and fits into a block anyways + and not (self.size is not None and self.size <= self.blocksize) + ): + self._fetch_all() + if self.size is None: + if length < 0: + self._fetch_all() + else: + length = min(self.size - self.loc, length) + return super().read(length) + + async def async_fetch_all(self): + """Read whole file in one shot, without caching + + This is only called when position is still at zero, + and read() is called without a byte-count. + """ + logger.debug(f"Fetch all for {self}") + if not isinstance(self.cache, AllBytes): + r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs) + async with r: + r.raise_for_status() + out = await r.read() + self.cache = AllBytes( + size=len(out), fetcher=None, blocksize=None, data=out + ) + self.size = len(out) + + _fetch_all = sync_wrapper(async_fetch_all) + + def _parse_content_range(self, headers): + """Parse the Content-Range header""" + s = headers.get("Content-Range", "") + m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s) + if not m: + return None, None, None + + if m[1] == "*": + start = end = None + else: + start, end = [int(x) for x in m[1].split("-")] + total = None if m[2] == "*" else int(m[2]) + return start, end, total + + async def async_fetch_range(self, start, end): + """Download a block of data + + The expectation is that the server returns only the requested bytes, + with HTTP code 206. If this is not the case, we first check the headers, + and then stream the output - if the data size is bigger than we + requested, an exception is raised. + """ + logger.debug(f"Fetch range for {self}: {start}-{end}") + kwargs = self.kwargs.copy() + headers = kwargs.pop("headers", {}).copy() + headers["Range"] = f"bytes={start}-{end - 1}" + logger.debug(f"{self.url} : {headers['Range']}") + r = await self.session.get( + self.fs.encode_url(self.url), headers=headers, **kwargs + ) + async with r: + if r.status == 416: + # range request outside file + return b"" + r.raise_for_status() + + # If the server has handled the range request, it should reply + # with status 206 (partial content). But we'll guess that a suitable + # Content-Range header or a Content-Length no more than the + # requested range also mean we have got the desired range. + response_is_range = ( + r.status == 206 + or self._parse_content_range(r.headers)[0] == start + or int(r.headers.get("Content-Length", end + 1)) <= end - start + ) + + if response_is_range: + # partial content, as expected + out = await r.read() + elif start > 0: + raise ValueError( + "The HTTP server doesn't appear to support range requests. " + "Only reading this file from the beginning is supported. " + "Open with block_size=0 for a streaming file interface." + ) + else: + # Response is not a range, but we want the start of the file, + # so we can read the required amount anyway. + cl = 0 + out = [] + while True: + chunk = await r.content.read(2**20) + # data size unknown, let's read until we have enough + if chunk: + out.append(chunk) + cl += len(chunk) + if cl > end - start: + break + else: + break + out = b"".join(out)[: end - start] + return out + + _fetch_range = sync_wrapper(async_fetch_range) + + def __reduce__(self): + return ( + reopen, + ( + self.fs, + self.url, + self.mode, + self.blocksize, + self.cache.name if self.cache else "none", + self.size, + ), + ) + + +def reopen(fs, url, mode, blocksize, cache_type, size=None): + return fs.open( + url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size + ) + + +magic_check = re.compile("([*[])") + + +def has_magic(s): + match = magic_check.search(s) + return match is not None + + +class HTTPStreamFile(AbstractBufferedFile): + def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs): + self.asynchronous = kwargs.pop("asynchronous", False) + self.url = url + self.loop = loop + self.session = session + if mode != "rb": + raise ValueError + self.details = {"name": url, "size": None} + super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs) + + async def cor(): + r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__() + self.fs._raise_not_found_for_status(r, url) + return r + + self.r = sync(self.loop, cor) + + def seek(self, loc, whence=0): + if loc == 0 and whence == 1: + return + if loc == self.loc and whence == 0: + return + raise ValueError("Cannot seek streaming HTTP file") + + async def _read(self, num=-1): + out = await self.r.content.read(num) + self.loc += len(out) + return out + + read = sync_wrapper(_read) + + async def _close(self): + self.r.close() + + def close(self): + asyncio.run_coroutine_threadsafe(self._close(), self.loop) + super().close() + + def __reduce__(self): + return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name) + + +class AsyncStreamFile(AbstractAsyncStreamedFile): + def __init__( + self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs + ): + self.url = url + self.session = session + self.r = None + if mode != "rb": + raise ValueError + self.details = {"name": url, "size": None} + self.kwargs = kwargs + super().__init__(fs=fs, path=url, mode=mode, cache_type="none") + self.size = size + + async def read(self, num=-1): + if self.r is None: + r = await self.session.get( + self.fs.encode_url(self.url), **self.kwargs + ).__aenter__() + self.fs._raise_not_found_for_status(r, self.url) + self.r = r + out = await self.r.content.read(num) + self.loc += len(out) + return out + + async def close(self): + if self.r is not None: + self.r.close() + self.r = None + await super().close() + + +async def get_range(session, url, start, end, file=None, **kwargs): + # explicit get a range when we know it must be safe + kwargs = kwargs.copy() + headers = kwargs.pop("headers", {}).copy() + headers["Range"] = f"bytes={start}-{end - 1}" + r = await session.get(url, headers=headers, **kwargs) + r.raise_for_status() + async with r: + out = await r.read() + if file: + with open(file, "r+b") as f: # noqa: ASYNC101 + f.seek(start) + f.write(out) + else: + return out + + +async def _file_info(url, session, size_policy="head", **kwargs): + """Call HEAD on the server to get details about the file (size/checksum etc.) + + Default operation is to explicitly allow redirects and use encoding + 'identity' (no compression) to get the true size of the target. + """ + logger.debug("Retrieve file size for %s", url) + kwargs = kwargs.copy() + ar = kwargs.pop("allow_redirects", True) + head = kwargs.get("headers", {}).copy() + head["Accept-Encoding"] = "identity" + kwargs["headers"] = head + + info = {} + if size_policy == "head": + r = await session.head(url, allow_redirects=ar, **kwargs) + elif size_policy == "get": + r = await session.get(url, allow_redirects=ar, **kwargs) + else: + raise TypeError(f'size_policy must be "head" or "get", got {size_policy}') + async with r: + r.raise_for_status() + + # TODO: + # recognise lack of 'Accept-Ranges', + # or 'Accept-Ranges': 'none' (not 'bytes') + # to mean streaming only, no random access => return None + if "Content-Length" in r.headers: + # Some servers may choose to ignore Accept-Encoding and return + # compressed content, in which case the returned size is unreliable. + if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [ + "identity", + "", + ]: + info["size"] = int(r.headers["Content-Length"]) + elif "Content-Range" in r.headers: + info["size"] = int(r.headers["Content-Range"].split("/")[1]) + + if "Content-Type" in r.headers: + info["mimetype"] = r.headers["Content-Type"].partition(";")[0] + + info["url"] = str(r.url) + + for checksum_field in ["ETag", "Content-MD5", "Digest"]: + if r.headers.get(checksum_field): + info[checksum_field] = r.headers[checksum_field] + + return info + + +async def _file_size(url, session=None, *args, **kwargs): + if session is None: + session = await get_client() + info = await _file_info(url, session=session, *args, **kwargs) + return info.get("size") + + +file_size = sync_wrapper(_file_size) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py new file mode 100644 index 0000000000000000000000000000000000000000..2839f4c1feea56dddd54bdc00f0b884c8461d29e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py @@ -0,0 +1,124 @@ +import base64 +import io +import re + +import requests + +import fsspec + + +class JupyterFileSystem(fsspec.AbstractFileSystem): + """View of the files as seen by a Jupyter server (notebook or lab)""" + + protocol = ("jupyter", "jlab") + + def __init__(self, url, tok=None, **kwargs): + """ + + Parameters + ---------- + url : str + Base URL of the server, like "http://127.0.0.1:8888". May include + token in the string, which is given by the process when starting up + tok : str + If the token is obtained separately, can be given here + kwargs + """ + if "?" in url: + if tok is None: + try: + tok = re.findall("token=([a-z0-9]+)", url)[0] + except IndexError as e: + raise ValueError("Could not determine token") from e + url = url.split("?", 1)[0] + self.url = url.rstrip("/") + "/api/contents" + self.session = requests.Session() + if tok: + self.session.headers["Authorization"] = f"token {tok}" + + super().__init__(**kwargs) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + r = self.session.get(f"{self.url}/{path}") + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + + if out["type"] == "directory": + out = out["content"] + else: + out = [out] + for o in out: + o["name"] = o.pop("path") + o.pop("content") + if o["type"] == "notebook": + o["type"] = "file" + if detail: + return out + return [o["name"] for o in out] + + def cat_file(self, path, start=None, end=None, **kwargs): + path = self._strip_protocol(path) + r = self.session.get(f"{self.url}/{path}") + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + if out["format"] == "text": + # data should be binary + b = out["content"].encode() + else: + b = base64.b64decode(out["content"]) + return b[start:end] + + def pipe_file(self, path, value, **_): + path = self._strip_protocol(path) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": len(value), + "content": base64.b64encode(value).decode(), + "format": "base64", + "type": "file", + } + self.session.put(f"{self.url}/{path}", json=json) + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if create_parents and "/" in path: + self.mkdir(path.rsplit("/", 1)[0], True) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": None, + "content": None, + "type": "directory", + } + self.session.put(f"{self.url}/{path}", json=json) + + def _rm(self, path): + path = self._strip_protocol(path) + self.session.delete(f"{self.url}/{path}") + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + if mode == "rb": + data = self.cat_file(path) + return io.BytesIO(data) + else: + return SimpleFileWriter(self, path, mode="wb") + + +class SimpleFileWriter(fsspec.spec.AbstractBufferedFile): + def _upload_chunk(self, final=False): + """Never uploads a chunk until file is done + + Not suitable for large files + """ + if final is False: + return False + self.buffer.seek(0) + data = self.buffer.read() + self.fs.pipe_file(self.path, data) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py new file mode 100644 index 0000000000000000000000000000000000000000..eb6f145352e1989e0477e259be02d8d7f4d729e2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py @@ -0,0 +1,213 @@ +from contextlib import contextmanager +from ctypes import ( + CFUNCTYPE, + POINTER, + c_int, + c_longlong, + c_void_p, + cast, + create_string_buffer, +) + +import libarchive +import libarchive.ffi as ffi + +from fsspec import open_files +from fsspec.archive import AbstractArchiveFileSystem +from fsspec.implementations.memory import MemoryFile +from fsspec.utils import DEFAULT_BLOCK_SIZE + +# Libarchive requires seekable files or memory only for certain archive +# types. However, since we read the directory first to cache the contents +# and also allow random access to any file, the file-like object needs +# to be seekable no matter what. + +# Seek call-backs (not provided in the libarchive python wrapper) +SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int) +read_set_seek_callback = ffi.ffi( + "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int +) +new_api = hasattr(ffi, "NO_OPEN_CB") + + +@contextmanager +def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size): + """Read an archive from a seekable file-like object. + + The `file` object must support the standard `readinto` and 'seek' methods. + """ + buf = create_string_buffer(block_size) + buf_p = cast(buf, c_void_p) + + def read_func(archive_p, context, ptrptr): + # readinto the buffer, returns number of bytes read + length = file.readinto(buf) + # write the address of the buffer into the pointer + ptrptr = cast(ptrptr, POINTER(c_void_p)) + ptrptr[0] = buf_p + # tell libarchive how much data was written into the buffer + return length + + def seek_func(archive_p, context, offset, whence): + file.seek(offset, whence) + # tell libarchvie the current position + return file.tell() + + read_cb = ffi.READ_CALLBACK(read_func) + seek_cb = SEEK_CALLBACK(seek_func) + + if new_api: + open_cb = ffi.NO_OPEN_CB + close_cb = ffi.NO_CLOSE_CB + else: + open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB) + close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB) + + with libarchive.read.new_archive_read(format_name, filter_name) as archive_p: + read_set_seek_callback(archive_p, seek_cb) + ffi.read_open(archive_p, None, open_cb, read_cb, close_cb) + yield libarchive.read.ArchiveRead(archive_p) + + +class LibArchiveFileSystem(AbstractArchiveFileSystem): + """Compressed archives as a file-system (read-only) + + Supports the following formats: + tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar + Microsoft CAB, 7-Zip, WARC + + See the libarchive documentation for further restrictions. + https://www.libarchive.org/ + + Keeps file object open while instance lives. It only works in seekable + file-like objects. In case the filesystem does not support this kind of + file object, it is recommended to cache locally. + + This class is pickleable, but not necessarily thread-safe (depends on the + platform). See libarchive documentation for details. + """ + + root_marker = "" + protocol = "libarchive" + cachable = False + + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + block_size=DEFAULT_BLOCK_SIZE, + **kwargs, + ): + """ + Parameters + ---------- + fo: str or file-like + Contains ZIP, and must exist. If a str, will fetch file using + :meth:`~fsspec.open_files`, which must return one file exactly. + mode: str + Currently, only 'r' accepted + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. + """ + super().__init__(self, **kwargs) + if mode != "r": + raise ValueError("Only read from archive files accepted") + if isinstance(fo, str): + files = open_files(fo, protocol=target_protocol, **(target_options or {})) + if len(files) != 1: + raise ValueError( + f'Path "{fo}" did not resolve to exactly one file: "{files}"' + ) + fo = files[0] + self.of = fo + self.fo = fo.__enter__() # the whole instance is a context + self.block_size = block_size + self.dir_cache = None + + @contextmanager + def _open_archive(self): + self.fo.seek(0) + with custom_reader(self.fo, block_size=self.block_size) as arc: + yield arc + + @classmethod + def _strip_protocol(cls, path): + # file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + def _get_dirs(self): + fields = { + "name": "pathname", + "size": "size", + "created": "ctime", + "mode": "mode", + "uid": "uid", + "gid": "gid", + "mtime": "mtime", + } + + if self.dir_cache is not None: + return + + self.dir_cache = {} + list_names = [] + with self._open_archive() as arc: + for entry in arc: + if not entry.isdir and not entry.isfile: + # Skip symbolic links, fifo entries, etc. + continue + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(set(entry.name)) + } + ) + f = {key: getattr(entry, fields[key]) for key in fields} + f["type"] = "directory" if entry.isdir else "file" + list_names.append(entry.name) + + self.dir_cache[f["name"]] = f + # libarchive does not seem to return an entry for the directories (at least + # not in all formats), so get the directories names from the files names + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(list_names) + } + ) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if mode != "rb": + raise NotImplementedError + + data = bytes() + with self._open_archive() as arc: + for entry in arc: + if entry.pathname != path: + continue + + if entry.size == 0: + # empty file, so there are no blocks + break + + for block in entry.get_blocks(entry.size): + data = block + break + else: + raise ValueError + return MemoryFile(fs=self, path=path, data=data) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py new file mode 100644 index 0000000000000000000000000000000000000000..9881606f138f59ba88a4c2882ef7f7b5de06c122 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py @@ -0,0 +1,467 @@ +import datetime +import io +import logging +import os +import os.path as osp +import shutil +import stat +import tempfile + +from fsspec import AbstractFileSystem +from fsspec.compression import compr +from fsspec.core import get_compression +from fsspec.utils import isfilelike, stringify_path + +logger = logging.getLogger("fsspec.local") + + +class LocalFileSystem(AbstractFileSystem): + """Interface to files on local storage + + Parameters + ---------- + auto_mkdir: bool + Whether, when opening a file, the directory containing it should + be created (if it doesn't already exist). This is assumed by pyarrow + code. + """ + + root_marker = "/" + protocol = "file", "local" + local_file = True + + def __init__(self, auto_mkdir=False, **kwargs): + super().__init__(**kwargs) + self.auto_mkdir = auto_mkdir + + @property + def fsid(self): + return "local" + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if self.exists(path): + raise FileExistsError(path) + if create_parents: + self.makedirs(path, exist_ok=True) + else: + os.mkdir(path, **kwargs) + + def makedirs(self, path, exist_ok=False): + path = self._strip_protocol(path) + os.makedirs(path, exist_ok=exist_ok) + + def rmdir(self, path): + path = self._strip_protocol(path) + os.rmdir(path) + + def ls(self, path, detail=False, **kwargs): + path = self._strip_protocol(path) + info = self.info(path) + if info["type"] == "directory": + with os.scandir(path) as it: + infos = [self.info(f) for f in it] + else: + infos = [info] + + if not detail: + return [i["name"] for i in infos] + return infos + + def info(self, path, **kwargs): + if isinstance(path, os.DirEntry): + # scandir DirEntry + out = path.stat(follow_symlinks=False) + link = path.is_symlink() + if path.is_dir(follow_symlinks=False): + t = "directory" + elif path.is_file(follow_symlinks=False): + t = "file" + else: + t = "other" + path = self._strip_protocol(path.path) + else: + # str or path-like + path = self._strip_protocol(path) + out = os.stat(path, follow_symlinks=False) + link = stat.S_ISLNK(out.st_mode) + if link: + out = os.stat(path, follow_symlinks=True) + if stat.S_ISDIR(out.st_mode): + t = "directory" + elif stat.S_ISREG(out.st_mode): + t = "file" + else: + t = "other" + result = { + "name": path, + "size": out.st_size, + "type": t, + "created": out.st_ctime, + "islink": link, + } + for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]: + result[field] = getattr(out, f"st_{field}") + if result["islink"]: + result["destination"] = os.readlink(path) + try: + out2 = os.stat(path, follow_symlinks=True) + result["size"] = out2.st_size + except OSError: + result["size"] = 0 + return result + + def lexists(self, path, **kwargs): + return osp.lexists(path) + + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + if self.auto_mkdir: + self.makedirs(self._parent(path2), exist_ok=True) + if self.isfile(path1): + shutil.copyfile(path1, path2) + elif self.isdir(path1): + self.mkdirs(path2, exist_ok=True) + else: + raise FileNotFoundError(path1) + + def isfile(self, path): + path = self._strip_protocol(path) + return os.path.isfile(path) + + def isdir(self, path): + path = self._strip_protocol(path) + return os.path.isdir(path) + + def get_file(self, path1, path2, callback=None, **kwargs): + if isfilelike(path2): + with open(path1, "rb") as f: + shutil.copyfileobj(f, path2) + else: + return self.cp_file(path1, path2, **kwargs) + + def put_file(self, path1, path2, callback=None, **kwargs): + return self.cp_file(path1, path2, **kwargs) + + def mv(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + shutil.move(path1, path2) + + def link(self, src, dst, **kwargs): + src = self._strip_protocol(src) + dst = self._strip_protocol(dst) + os.link(src, dst, **kwargs) + + def symlink(self, src, dst, **kwargs): + src = self._strip_protocol(src) + dst = self._strip_protocol(dst) + os.symlink(src, dst, **kwargs) + + def islink(self, path) -> bool: + return os.path.islink(self._strip_protocol(path)) + + def rm_file(self, path): + os.remove(self._strip_protocol(path)) + + def rm(self, path, recursive=False, maxdepth=None): + if not isinstance(path, list): + path = [path] + + for p in path: + p = self._strip_protocol(p) + if self.isdir(p): + if not recursive: + raise ValueError("Cannot delete directory, set recursive=True") + if osp.abspath(p) == os.getcwd(): + raise ValueError("Cannot delete current working directory") + shutil.rmtree(p) + else: + os.remove(p) + + def unstrip_protocol(self, name): + name = self._strip_protocol(name) # normalise for local/win/... + return f"file://{name}" + + def _open(self, path, mode="rb", block_size=None, **kwargs): + path = self._strip_protocol(path) + if self.auto_mkdir and "w" in mode: + self.makedirs(self._parent(path), exist_ok=True) + return LocalFileOpener(path, mode, fs=self, **kwargs) + + def touch(self, path, truncate=True, **kwargs): + path = self._strip_protocol(path) + if self.auto_mkdir: + self.makedirs(self._parent(path), exist_ok=True) + if self.exists(path): + os.utime(path, None) + else: + open(path, "a").close() + if truncate: + os.truncate(path, 0) + + def created(self, path): + info = self.info(path=path) + return datetime.datetime.fromtimestamp( + info["created"], tz=datetime.timezone.utc + ) + + def modified(self, path): + info = self.info(path=path) + return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc) + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if os.sep == "/": + # posix native + return path.rsplit("/", 1)[0] or "/" + else: + # NT + path_ = path.rsplit("/", 1)[0] + if len(path_) <= 3: + if path_[1:2] == ":": + # nt root (something like c:/) + return path_[0] + ":/" + # More cases may be required here + return path_ + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("file://"): + path = path[7:] + elif path.startswith("file:"): + path = path[5:] + elif path.startswith("local://"): + path = path[8:] + elif path.startswith("local:"): + path = path[6:] + + path = make_path_posix(path) + if os.sep != "/": + # This code-path is a stripped down version of + # > drive, path = ntpath.splitdrive(path) + if path[1:2] == ":": + # Absolute drive-letter path, e.g. X:\Windows + # Relative path with drive, e.g. X:Windows + drive, path = path[:2], path[2:] + elif path[:2] == "//": + # UNC drives, e.g. \\server\share or \\?\UNC\server\share + # Device drives, e.g. \\.\device or \\?\device + if (index1 := path.find("/", 2)) == -1 or ( + index2 := path.find("/", index1 + 1) + ) == -1: + drive, path = path, "" + else: + drive, path = path[:index2], path[index2:] + else: + # Relative path, e.g. Windows + drive = "" + + path = path.rstrip("/") or cls.root_marker + return drive + path + + else: + return path.rstrip("/") or cls.root_marker + + def _isfilestore(self): + # Inheriting from DaskFileSystem makes this False (S3, etc. were) + # the original motivation. But we are a posix-like file system. + # See https://github.com/dask/dask/issues/5526 + return True + + def chmod(self, path, mode): + path = stringify_path(path) + return os.chmod(path, mode) + + +def make_path_posix(path): + """Make path generic and absolute for current OS""" + if not isinstance(path, str): + if isinstance(path, (list, set, tuple)): + return type(path)(make_path_posix(p) for p in path) + else: + path = stringify_path(path) + if not isinstance(path, str): + raise TypeError(f"could not convert {path!r} to string") + if os.sep == "/": + # Native posix + if path.startswith("/"): + # most common fast case for posix + return path + elif path.startswith("~"): + return osp.expanduser(path) + elif path.startswith("./"): + path = path[2:] + elif path == ".": + path = "" + return f"{os.getcwd()}/{path}" + else: + # NT handling + if path[0:1] == "/" and path[2:3] == ":": + # path is like "/c:/local/path" + path = path[1:] + if path[1:2] == ":": + # windows full path like "C:\\local\\path" + if len(path) <= 3: + # nt root (something like c:/) + return path[0] + ":/" + path = path.replace("\\", "/") + return path + elif path[0:1] == "~": + return make_path_posix(osp.expanduser(path)) + elif path.startswith(("\\\\", "//")): + # windows UNC/DFS-style paths + return "//" + path[2:].replace("\\", "/") + elif path.startswith(("\\", "/")): + # windows relative path with root + path = path.replace("\\", "/") + return f"{osp.splitdrive(os.getcwd())[0]}{path}" + else: + path = path.replace("\\", "/") + if path.startswith("./"): + path = path[2:] + elif path == ".": + path = "" + return f"{make_path_posix(os.getcwd())}/{path}" + + +def trailing_sep(path): + """Return True if the path ends with a path separator. + + A forward slash is always considered a path separator, even on Operating + Systems that normally use a backslash. + """ + # TODO: if all incoming paths were posix-compliant then separator would + # always be a forward slash, simplifying this function. + # See https://github.com/fsspec/filesystem_spec/pull/1250 + return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep)) + + +class LocalFileOpener(io.IOBase): + def __init__( + self, path, mode, autocommit=True, fs=None, compression=None, **kwargs + ): + logger.debug("open file: %s", path) + self.path = path + self.mode = mode + self.fs = fs + self.f = None + self.autocommit = autocommit + self.compression = get_compression(path, compression) + self.blocksize = io.DEFAULT_BUFFER_SIZE + self._open() + + def _open(self): + if self.f is None or self.f.closed: + if self.autocommit or "w" not in self.mode: + self.f = open(self.path, mode=self.mode) + if self.compression: + compress = compr[self.compression] + self.f = compress(self.f, mode=self.mode) + else: + # TODO: check if path is writable? + i, name = tempfile.mkstemp() + os.close(i) # we want normal open and normal buffered file + self.temp = name + self.f = open(name, mode=self.mode) + if "w" not in self.mode: + self.size = self.f.seek(0, 2) + self.f.seek(0) + self.f.size = self.size + + def _fetch_range(self, start, end): + # probably only used by cached FS + if "r" not in self.mode: + raise ValueError + self._open() + self.f.seek(start) + return self.f.read(end - start) + + def __setstate__(self, state): + self.f = None + loc = state.pop("loc", None) + self.__dict__.update(state) + if "r" in state["mode"]: + self.f = None + self._open() + self.f.seek(loc) + + def __getstate__(self): + d = self.__dict__.copy() + d.pop("f") + if "r" in self.mode: + d["loc"] = self.f.tell() + else: + if not self.f.closed: + raise ValueError("Cannot serialise open write-mode local file") + return d + + def commit(self): + if self.autocommit: + raise RuntimeError("Can only commit if not already set to autocommit") + shutil.move(self.temp, self.path) + + def discard(self): + if self.autocommit: + raise RuntimeError("Cannot discard if set to autocommit") + os.remove(self.temp) + + def readable(self) -> bool: + return True + + def writable(self) -> bool: + return "r" not in self.mode + + def read(self, *args, **kwargs): + return self.f.read(*args, **kwargs) + + def write(self, *args, **kwargs): + return self.f.write(*args, **kwargs) + + def tell(self, *args, **kwargs): + return self.f.tell(*args, **kwargs) + + def seek(self, *args, **kwargs): + return self.f.seek(*args, **kwargs) + + def seekable(self, *args, **kwargs): + return self.f.seekable(*args, **kwargs) + + def readline(self, *args, **kwargs): + return self.f.readline(*args, **kwargs) + + def readlines(self, *args, **kwargs): + return self.f.readlines(*args, **kwargs) + + def close(self): + return self.f.close() + + def truncate(self, size=None) -> int: + return self.f.truncate(size) + + @property + def closed(self): + return self.f.closed + + def fileno(self): + return self.raw.fileno() + + def flush(self) -> None: + self.f.flush() + + def __iter__(self): + return self.f.__iter__() + + def __getattr__(self, item): + return getattr(self.f, item) + + def __enter__(self): + self._incontext = True + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._incontext = False + self.f.__exit__(exc_type, exc_value, traceback) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..83e7e74d6ceceaf6e75268923094bfdf56b72fc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py @@ -0,0 +1,303 @@ +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from errno import ENOTEMPTY +from io import BytesIO +from pathlib import PurePath, PureWindowsPath +from typing import Any, ClassVar + +from fsspec import AbstractFileSystem +from fsspec.implementations.local import LocalFileSystem +from fsspec.utils import stringify_path + +logger = logging.getLogger("fsspec.memoryfs") + + +class MemoryFileSystem(AbstractFileSystem): + """A filesystem based on a dict of BytesIO objects + + This is a global filesystem so instances of this class all point to the same + in memory filesystem. + """ + + store: ClassVar[dict[str, Any]] = {} # global, do not overwrite! + pseudo_dirs = [""] # global, do not overwrite! + protocol = "memory" + root_marker = "/" + + @classmethod + def _strip_protocol(cls, path): + if isinstance(path, PurePath): + if isinstance(path, PureWindowsPath): + return LocalFileSystem._strip_protocol(path) + else: + path = stringify_path(path) + + if path.startswith("memory://"): + path = path[len("memory://") :] + if "::" in path or "://" in path: + return path.rstrip("/") + path = path.lstrip("/").rstrip("/") + return "/" + path if path else "" + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + if path in self.store: + # there is a key with this exact name + if not detail: + return [path] + return [ + { + "name": path, + "size": self.store[path].size, + "type": "file", + "created": self.store[path].created.timestamp(), + } + ] + paths = set() + starter = path + "/" + out = [] + for p2 in tuple(self.store): + if p2.startswith(starter): + if "/" not in p2[len(starter) :]: + # exact child + out.append( + { + "name": p2, + "size": self.store[p2].size, + "type": "file", + "created": self.store[p2].created.timestamp(), + } + ) + elif len(p2) > len(starter): + # implied child directory + ppath = starter + p2[len(starter) :].split("/", 1)[0] + if ppath not in paths: + out = out or [] + out.append( + { + "name": ppath, + "size": 0, + "type": "directory", + } + ) + paths.add(ppath) + for p2 in self.pseudo_dirs: + if p2.startswith(starter): + if "/" not in p2[len(starter) :]: + # exact child pdir + if p2 not in paths: + out.append({"name": p2, "size": 0, "type": "directory"}) + paths.add(p2) + else: + # directory implied by deeper pdir + ppath = starter + p2[len(starter) :].split("/", 1)[0] + if ppath not in paths: + out.append({"name": ppath, "size": 0, "type": "directory"}) + paths.add(ppath) + if not out: + if path in self.pseudo_dirs: + # empty dir + return [] + raise FileNotFoundError(path) + if detail: + return out + return sorted([f["name"] for f in out]) + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if path in self.store or path in self.pseudo_dirs: + raise FileExistsError(path) + if self._parent(path).strip("/") and self.isfile(self._parent(path)): + raise NotADirectoryError(self._parent(path)) + if create_parents and self._parent(path).strip("/"): + try: + self.mkdir(self._parent(path), create_parents, **kwargs) + except FileExistsError: + pass + if path and path not in self.pseudo_dirs: + self.pseudo_dirs.append(path) + + def makedirs(self, path, exist_ok=False): + try: + self.mkdir(path, create_parents=True) + except FileExistsError: + if not exist_ok: + raise + + def pipe_file(self, path, value, **kwargs): + """Set the bytes of given file + + Avoids copies of the data if possible + """ + self.open(path, "wb", data=value) + + def rmdir(self, path): + path = self._strip_protocol(path) + if path == "": + # silently avoid deleting FS root + return + if path in self.pseudo_dirs: + if not self.ls(path): + self.pseudo_dirs.remove(path) + else: + raise OSError(ENOTEMPTY, "Directory not empty", path) + else: + raise FileNotFoundError(path) + + def info(self, path, **kwargs): + logger.debug("info: %s", path) + path = self._strip_protocol(path) + if path in self.pseudo_dirs or any( + p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs + ): + return { + "name": path, + "size": 0, + "type": "directory", + } + elif path in self.store: + filelike = self.store[path] + return { + "name": path, + "size": filelike.size, + "type": "file", + "created": getattr(filelike, "created", None), + } + else: + raise FileNotFoundError(path) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if path in self.pseudo_dirs: + raise IsADirectoryError(path) + parent = path + while len(parent) > 1: + parent = self._parent(parent) + if self.isfile(parent): + raise FileExistsError(parent) + if mode in ["rb", "ab", "r+b"]: + if path in self.store: + f = self.store[path] + if mode == "ab": + # position at the end of file + f.seek(0, 2) + else: + # position at the beginning of file + f.seek(0) + return f + else: + raise FileNotFoundError(path) + elif mode == "wb": + m = MemoryFile(self, path, kwargs.get("data")) + if not self._intrans: + m.commit() + return m + else: + name = self.__class__.__name__ + raise ValueError(f"unsupported file mode for {name}: {mode!r}") + + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1) + path2 = self._strip_protocol(path2) + if self.isfile(path1): + self.store[path2] = MemoryFile( + self, path2, self.store[path1].getvalue() + ) # implicit copy + elif self.isdir(path1): + if path2 not in self.pseudo_dirs: + self.pseudo_dirs.append(path2) + else: + raise FileNotFoundError(path1) + + def cat_file(self, path, start=None, end=None, **kwargs): + logger.debug("cat: %s", path) + path = self._strip_protocol(path) + try: + return bytes(self.store[path].getbuffer()[start:end]) + except KeyError: + raise FileNotFoundError(path) + + def _rm(self, path): + path = self._strip_protocol(path) + try: + del self.store[path] + except KeyError as e: + raise FileNotFoundError(path) from e + + def modified(self, path): + path = self._strip_protocol(path) + try: + return self.store[path].modified + except KeyError: + raise FileNotFoundError(path) + + def created(self, path): + path = self._strip_protocol(path) + try: + return self.store[path].created + except KeyError: + raise FileNotFoundError(path) + + def rm(self, path, recursive=False, maxdepth=None): + if isinstance(path, str): + path = self._strip_protocol(path) + else: + path = [self._strip_protocol(p) for p in path] + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(paths): + # If the expanded path doesn't exist, it is only because the expanded + # path was a directory that does not exist in self.pseudo_dirs. This + # is possible if you directly create files without making the + # directories first. + if not self.exists(p): + continue + if self.isfile(p): + self.rm_file(p) + else: + self.rmdir(p) + + +class MemoryFile(BytesIO): + """A BytesIO which can't close and works as a context manager + + Can initialise with data. Each path should only be active once at any moment. + + No need to provide fs, path if auto-committing (default) + """ + + def __init__(self, fs=None, path=None, data=None): + logger.debug("open file %s", path) + self.fs = fs + self.path = path + self.created = datetime.now(tz=timezone.utc) + self.modified = datetime.now(tz=timezone.utc) + if data: + super().__init__(data) + self.seek(0) + + @property + def size(self): + return self.getbuffer().nbytes + + def __enter__(self): + return self + + def close(self): + pass + + def discard(self): + pass + + def commit(self): + self.fs.store[self.path] = self + self.modified = datetime.now(tz=timezone.utc) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py new file mode 100644 index 0000000000000000000000000000000000000000..e202c96d6bcb000144710115156c1c4c78935bc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py @@ -0,0 +1,1160 @@ +import base64 +import collections +import io +import itertools +import logging +import math +import os +from functools import lru_cache +from typing import TYPE_CHECKING + +import fsspec.core + +try: + import ujson as json +except ImportError: + if not TYPE_CHECKING: + import json + +from ..asyn import AsyncFileSystem +from ..callbacks import DEFAULT_CALLBACK +from ..core import filesystem, open, split_protocol +from ..utils import isfilelike, merge_offset_ranges, other_paths + +logger = logging.getLogger("fsspec.reference") + + +class ReferenceNotReachable(RuntimeError): + def __init__(self, reference, target, *args): + super().__init__(*args) + self.reference = reference + self.target = target + + def __str__(self): + return f'Reference "{self.reference}" failed to fetch target {self.target}' + + +def _first(d): + return list(d.values())[0] + + +def _prot_in_references(path, references): + ref = references.get(path) + if isinstance(ref, (list, tuple)): + return split_protocol(ref[0])[0] if ref[0] else ref[0] + + +def _protocol_groups(paths, references): + if isinstance(paths, str): + return {_prot_in_references(paths, references): [paths]} + out = {} + for path in paths: + protocol = _prot_in_references(path, references) + out.setdefault(protocol, []).append(path) + return out + + +class RefsValuesView(collections.abc.ValuesView): + def __iter__(self): + for val in self._mapping.zmetadata.values(): + yield json.dumps(val).encode() + yield from self._mapping._items.values() + for field in self._mapping.listdir(): + chunk_sizes = self._mapping._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + yield self._mapping[field + "/0"] + continue + yield from self._mapping._generate_all_records(field) + + +class RefsItemsView(collections.abc.ItemsView): + def __iter__(self): + return zip(self._mapping.keys(), self._mapping.values()) + + +def ravel_multi_index(idx, sizes): + val = 0 + mult = 1 + for i, s in zip(idx[::-1], sizes[::-1]): + val += i * mult + mult *= s + return val + + +class LazyReferenceMapper(collections.abc.MutableMapping): + """This interface can be used to read/write references from Parquet stores. + It is not intended for other types of references. + It can be used with Kerchunk's MultiZarrToZarr method to combine + references into a parquet store. + Examples of this use-case can be found here: + https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage""" + + # import is class level to prevent numpy dep requirement for fsspec + @property + def np(self): + import numpy as np + + return np + + @property + def pd(self): + import pandas as pd + + return pd + + def __init__( + self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10 + ): + """ + + This instance will be writable, storing changes in memory until full partitions + are accumulated or .flush() is called. + + To create an empty lazy store, use .create() + + Parameters + ---------- + root : str + Root of parquet store + fs : fsspec.AbstractFileSystem + fsspec filesystem object, default is local filesystem. + cache_size : int, default=128 + Maximum size of LRU cache, where cache_size*record_size denotes + the total number of references that can be loaded in memory at once. + categorical_threshold : int + Encode urls as pandas.Categorical to reduce memory footprint if the ratio + of the number of unique urls to total number of refs for each variable + is greater than or equal to this number. (default 10) + """ + self.root = root + self.chunk_sizes = {} + self.out_root = out_root or self.root + self.cat_thresh = categorical_threshold + self.cache_size = cache_size + self.dirs = None + self.url = self.root + "/{field}/refs.{record}.parq" + # TODO: derive fs from `root` + self.fs = fsspec.filesystem("file") if fs is None else fs + + def __getattr__(self, item): + if item in ("_items", "record_size", "zmetadata"): + self.setup() + # avoid possible recursion if setup fails somehow + return self.__dict__[item] + raise AttributeError(item) + + def setup(self): + self._items = {} + self._items[".zmetadata"] = self.fs.cat_file( + "/".join([self.root, ".zmetadata"]) + ) + met = json.loads(self._items[".zmetadata"]) + self.record_size = met["record_size"] + self.zmetadata = met["metadata"] + + # Define function to open and decompress refs + @lru_cache(maxsize=self.cache_size) + def open_refs(field, record): + """cached parquet file loader""" + path = self.url.format(field=field, record=record) + data = io.BytesIO(self.fs.cat_file(path)) + df = self.pd.read_parquet(data, engine="fastparquet") + refs = {c: df[c].values for c in df.columns} + return refs + + self.open_refs = open_refs + + @staticmethod + def create(root, storage_options=None, fs=None, record_size=10000, **kwargs): + """Make empty parquet reference set + + First deletes the contents of the given directory, if it exists. + + Parameters + ---------- + root: str + Directory to contain the output; will be created + storage_options: dict | None + For making the filesystem to use for writing is fs is None + fs: FileSystem | None + Filesystem for writing + record_size: int + Number of references per parquet file + kwargs: passed to __init__ + + Returns + ------- + LazyReferenceMapper instance + """ + met = {"metadata": {}, "record_size": record_size} + if fs is None: + fs, root = fsspec.core.url_to_fs(root, **(storage_options or {})) + if fs.exists(root): + fs.rm(root, recursive=True) + fs.makedirs(root, exist_ok=True) + fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode()) + return LazyReferenceMapper(root, fs, **kwargs) + + def listdir(self, basename=True): + """List top-level directories""" + # cache me? + if self.dirs is None: + dirs = [p.split("/", 1)[0] for p in self.zmetadata] + self.dirs = {p for p in dirs if p and not p.startswith(".")} + listing = self.dirs + if basename: + listing = [os.path.basename(path) for path in listing] + return listing + + def ls(self, path="", detail=True): + """Shortcut file listings""" + if not path: + dirnames = self.listdir() + others = set( + [".zmetadata"] + + [name for name in self.zmetadata if "/" not in name] + + [name for name in self._items if "/" not in name] + ) + if detail is False: + others.update(dirnames) + return sorted(others) + dirinfo = [ + {"name": name, "type": "directory", "size": 0} for name in dirnames + ] + fileinfo = [ + { + "name": name, + "type": "file", + "size": len( + json.dumps(self.zmetadata[name]) + if name in self.zmetadata + else self._items[name] + ), + } + for name in others + ] + return sorted(dirinfo + fileinfo, key=lambda s: s["name"]) + parts = path.split("/", 1) + if len(parts) > 1: + raise FileNotFoundError("Cannot list within directories right now") + field = parts[0] + others = set( + [name for name in self.zmetadata if name.startswith(f"{path}/")] + + [name for name in self._items if name.startswith(f"{path}/")] + ) + fileinfo = [ + { + "name": name, + "type": "file", + "size": len( + json.dumps(self.zmetadata[name]) + if name in self.zmetadata + else self._items[name] + ), + } + for name in others + ] + keys = self._keys_in_field(field) + + if detail is False: + return list(others) + list(keys) + recs = self._generate_all_records(field) + recinfo = [ + {"name": name, "type": "file", "size": rec[-1]} + for name, rec in zip(keys, recs) + if rec[0] # filters out path==None, deleted/missing + ] + return fileinfo + recinfo + + def _load_one_key(self, key): + """Get the reference for one key + + Returns bytes, one-element list or three-element list. + """ + if key in self._items: + return self._items[key] + elif key in self.zmetadata: + return json.dumps(self.zmetadata[key]).encode() + elif "/" not in key or self._is_meta(key): + raise KeyError(key) + field, _ = key.rsplit("/", 1) + record, ri, chunk_size = self._key_to_record(key) + maybe = self._items.get((field, record), {}).get(ri, False) + if maybe is None: + # explicitly deleted + raise KeyError + elif maybe: + return maybe + elif chunk_size == 0: + return b"" + + # Chunk keys can be loaded from row group and cached in LRU cache + try: + refs = self.open_refs(field, record) + except (ValueError, TypeError, FileNotFoundError): + raise KeyError(key) + columns = ["path", "offset", "size", "raw"] + selection = [refs[c][ri] if c in refs else None for c in columns] + raw = selection[-1] + if raw is not None: + return raw + if selection[0] is None: + raise KeyError("This reference does not exist or has been deleted") + if selection[1:3] == [0, 0]: + # URL only + return selection[:1] + # URL, offset, size + return selection[:3] + + @lru_cache(4096) + def _key_to_record(self, key): + """Details needed to construct a reference for one key""" + field, chunk = key.rsplit("/", 1) + chunk_sizes = self._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + return 0, 0, 0 + chunk_idx = [int(c) for c in chunk.split(".")] + chunk_number = ravel_multi_index(chunk_idx, chunk_sizes) + record = chunk_number // self.record_size + ri = chunk_number % self.record_size + return record, ri, len(chunk_sizes) + + def _get_chunk_sizes(self, field): + """The number of chunks along each axis for a given field""" + if field not in self.chunk_sizes: + zarray = self.zmetadata[f"{field}/.zarray"] + size_ratio = [ + math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"]) + ] + self.chunk_sizes[field] = size_ratio or [1] + return self.chunk_sizes[field] + + def _generate_record(self, field, record): + """The references for a given parquet file of a given field""" + refs = self.open_refs(field, record) + it = iter(zip(*refs.values())) + if len(refs) == 3: + # All urls + return (list(t) for t in it) + elif len(refs) == 1: + # All raws + return refs["raw"] + else: + # Mix of urls and raws + return (list(t[:3]) if not t[3] else t[3] for t in it) + + def _generate_all_records(self, field): + """Load all the references within a field by iterating over the parquet files""" + nrec = 1 + for ch in self._get_chunk_sizes(field): + nrec *= ch + nrec = math.ceil(nrec / self.record_size) + for record in range(nrec): + yield from self._generate_record(field, record) + + def values(self): + return RefsValuesView(self) + + def items(self): + return RefsItemsView(self) + + def __hash__(self): + return id(self) + + def __getitem__(self, key): + return self._load_one_key(key) + + def __setitem__(self, key, value): + if "/" in key and not self._is_meta(key): + field, chunk = key.rsplit("/", 1) + record, i, _ = self._key_to_record(key) + subdict = self._items.setdefault((field, record), {}) + subdict[i] = value + if len(subdict) == self.record_size: + self.write(field, record) + else: + # metadata or top-level + self._items[key] = value + new_value = json.loads( + value.decode() if isinstance(value, bytes) else value + ) + self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value} + + @staticmethod + def _is_meta(key): + return key.startswith(".z") or "/.z" in key + + def __delitem__(self, key): + if key in self._items: + del self._items[key] + elif key in self.zmetadata: + del self.zmetadata[key] + else: + if "/" in key and not self._is_meta(key): + field, _ = key.rsplit("/", 1) + record, i, _ = self._key_to_record(key) + subdict = self._items.setdefault((field, record), {}) + subdict[i] = None + if len(subdict) == self.record_size: + self.write(field, record) + else: + # metadata or top-level + self._items[key] = None + + def write(self, field, record, base_url=None, storage_options=None): + # extra requirements if writing + import kerchunk.df + import numpy as np + import pandas as pd + + partition = self._items[(field, record)] + original = False + if len(partition) < self.record_size: + try: + original = self.open_refs(field, record) + except IOError: + pass + + if original: + paths = original["path"] + offsets = original["offset"] + sizes = original["size"] + raws = original["raw"] + else: + paths = np.full(self.record_size, np.nan, dtype="O") + offsets = np.zeros(self.record_size, dtype="int64") + sizes = np.zeros(self.record_size, dtype="int64") + raws = np.full(self.record_size, np.nan, dtype="O") + for j, data in partition.items(): + if isinstance(data, list): + if ( + str(paths.dtype) == "category" + and data[0] not in paths.dtype.categories + ): + paths = paths.add_categories(data[0]) + paths[j] = data[0] + if len(data) > 1: + offsets[j] = data[1] + sizes[j] = data[2] + elif data is None: + # delete + paths[j] = None + offsets[j] = 0 + sizes[j] = 0 + raws[j] = None + else: + # this is the only call into kerchunk, could remove + raws[j] = kerchunk.df._proc_raw(data) + # TODO: only save needed columns + df = pd.DataFrame( + { + "path": paths, + "offset": offsets, + "size": sizes, + "raw": raws, + }, + copy=False, + ) + if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh: + df["path"] = df["path"].astype("category") + object_encoding = {"raw": "bytes", "path": "utf8"} + has_nulls = ["path", "raw"] + + fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq" + self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True) + df.to_parquet( + fn, + engine="fastparquet", + storage_options=storage_options + or getattr(self.fs, "storage_options", None), + compression="zstd", + index=False, + stats=False, + object_encoding=object_encoding, + has_nulls=has_nulls, + # **kwargs, + ) + partition.clear() + self._items.pop((field, record)) + + def flush(self, base_url=None, storage_options=None): + """Output any modified or deleted keys + + Parameters + ---------- + base_url: str + Location of the output + """ + # write what we have so far and clear sub chunks + for thing in list(self._items): + if isinstance(thing, tuple): + field, record = thing + self.write( + field, + record, + base_url=base_url, + storage_options=storage_options, + ) + + # gather .zmetadata from self._items and write that too + for k in list(self._items): + if k != ".zmetadata" and ".z" in k: + self.zmetadata[k] = json.loads(self._items.pop(k)) + met = {"metadata": self.zmetadata, "record_size": self.record_size} + self._items[".zmetadata"] = json.dumps(met).encode() + self.fs.pipe( + "/".join([base_url or self.out_root, ".zmetadata"]), + self._items[".zmetadata"], + ) + + # TODO: only clear those that we wrote to? + self.open_refs.cache_clear() + + def __len__(self): + # Caveat: This counts expected references, not actual - but is fast + count = 0 + for field in self.listdir(): + if field.startswith("."): + count += 1 + else: + count += math.prod(self._get_chunk_sizes(field)) + count += len(self.zmetadata) # all metadata keys + # any other files not in reference partitions + count += sum(1 for _ in self._items if not isinstance(_, tuple)) + return count + + def __iter__(self): + # Caveat: returns only existing keys, so the number of these does not + # match len(self) + metas = set(self.zmetadata) + metas.update(self._items) + for bit in metas: + if isinstance(bit, str): + yield bit + for field in self.listdir(): + for k in self._keys_in_field(field): + if k in self: + yield k + + def __contains__(self, item): + try: + self._load_one_key(item) + return True + except KeyError: + return False + + def _keys_in_field(self, field): + """List key names in given field + + Produces strings like "field/x.y" appropriate from the chunking of the array + """ + chunk_sizes = self._get_chunk_sizes(field) + if len(chunk_sizes) == 0: + yield field + "/0" + return + inds = itertools.product(*(range(i) for i in chunk_sizes)) + for ind in inds: + yield field + "/" + ".".join([str(c) for c in ind]) + + +class ReferenceFileSystem(AsyncFileSystem): + """View byte ranges of some other file as a file system + Initial version: single file system target, which must support + async, and must allow start and end args in _cat_file. Later versions + may allow multiple arbitrary URLs for the targets. + This FileSystem is read-only. It is designed to be used with async + targets (for now). This FileSystem only allows whole-file access, no + ``open``. We do not get original file details from the target FS. + Configuration is by passing a dict of references at init, or a URL to + a JSON file containing the same; this dict + can also contain concrete data for some set of paths. + Reference dict format: + {path0: bytes_data, path1: (target_url, offset, size)} + https://github.com/fsspec/kerchunk/blob/main/README.md + """ + + protocol = "reference" + + def __init__( + self, + fo, + target=None, + ref_storage_args=None, + target_protocol=None, + target_options=None, + remote_protocol=None, + remote_options=None, + fs=None, + template_overrides=None, + simple_templates=True, + max_gap=64_000, + max_block=256_000_000, + cache_size=128, + **kwargs, + ): + """ + Parameters + ---------- + fo : dict or str + The set of references to use for this instance, with a structure as above. + If str referencing a JSON file, will use fsspec.open, in conjunction + with target_options and target_protocol to open and parse JSON at this + location. If a directory, then assume references are a set of parquet + files to be loaded lazily. + target : str + For any references having target_url as None, this is the default file + target to use + ref_storage_args : dict + If references is a str, use these kwargs for loading the JSON file. + Deprecated: use target_options instead. + target_protocol : str + Used for loading the reference file, if it is a path. If None, protocol + will be derived from the given path + target_options : dict + Extra FS options for loading the reference file ``fo``, if given as a path + remote_protocol : str + The protocol of the filesystem on which the references will be evaluated + (unless fs is provided). If not given, will be derived from the first + URL that has a protocol in the templates or in the references, in that + order. + remote_options : dict + kwargs to go with remote_protocol + fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict)) + Directly provide a file system(s): + - a single filesystem instance + - a dict of protocol:filesystem, where each value is either a filesystem + instance, or a dict of kwargs that can be used to create in + instance for the given protocol + + If this is given, remote_options and remote_protocol are ignored. + template_overrides : dict + Swap out any templates in the references file with these - useful for + testing. + simple_templates: bool + Whether templates can be processed with simple replace (True) or if + jinja is needed (False, much slower). All reference sets produced by + ``kerchunk`` are simple in this sense, but the spec allows for complex. + max_gap, max_block: int + For merging multiple concurrent requests to the same remote file. + Neighboring byte ranges will only be merged when their + inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0 + to only merge when it requires no extra bytes. Pass a negative + number to disable merging, appropriate for local target files. + Neighboring byte ranges will only be merged when the size of + the aggregated range is <= ``max_block``. Default is 256MB. + cache_size : int + Maximum size of LRU cache, where cache_size*record_size denotes + the total number of references that can be loaded in memory at once. + Only used for lazily loaded references. + kwargs : passed to parent class + """ + super().__init__(**kwargs) + self.target = target + self.template_overrides = template_overrides + self.simple_templates = simple_templates + self.templates = {} + self.fss = {} + self._dircache = {} + self.max_gap = max_gap + self.max_block = max_block + if isinstance(fo, str): + dic = dict( + **(ref_storage_args or target_options or {}), protocol=target_protocol + ) + ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic) + if ref_fs.isfile(fo2): + # text JSON + with fsspec.open(fo, "rb", **dic) as f: + logger.info("Read reference from URL %s", fo) + text = json.load(f) + self._process_references(text, template_overrides) + else: + # Lazy parquet refs + logger.info("Open lazy reference dict from URL %s", fo) + self.references = LazyReferenceMapper( + fo2, + fs=ref_fs, + cache_size=cache_size, + ) + else: + # dictionaries + self._process_references(fo, template_overrides) + if isinstance(fs, dict): + self.fss = { + k: ( + fsspec.filesystem(k.split(":", 1)[0], **opts) + if isinstance(opts, dict) + else opts + ) + for k, opts in fs.items() + } + if None not in self.fss: + self.fss[None] = filesystem("file") + return + if fs is not None: + # single remote FS + remote_protocol = ( + fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol + ) + self.fss[remote_protocol] = fs + + if remote_protocol is None: + # get single protocol from any templates + for ref in self.templates.values(): + if callable(ref): + ref = ref() + protocol, _ = fsspec.core.split_protocol(ref) + if protocol and protocol not in self.fss: + fs = filesystem(protocol, **(remote_options or {})) + self.fss[protocol] = fs + if remote_protocol is None: + # get single protocol from references + # TODO: warning here, since this can be very expensive? + for ref in self.references.values(): + if callable(ref): + ref = ref() + if isinstance(ref, list) and ref[0]: + protocol, _ = fsspec.core.split_protocol(ref[0]) + if protocol not in self.fss: + fs = filesystem(protocol, **(remote_options or {})) + self.fss[protocol] = fs + # only use first remote URL + break + + if remote_protocol and remote_protocol not in self.fss: + fs = filesystem(remote_protocol, **(remote_options or {})) + self.fss[remote_protocol] = fs + + self.fss[None] = fs or filesystem("file") # default one + + def _cat_common(self, path, start=None, end=None): + path = self._strip_protocol(path) + logger.debug(f"cat: {path}") + try: + part = self.references[path] + except KeyError: + raise FileNotFoundError(path) + if isinstance(part, str): + part = part.encode() + if isinstance(part, bytes): + logger.debug(f"Reference: {path}, type bytes") + if part.startswith(b"base64:"): + part = base64.b64decode(part[7:]) + return part, None, None + + if len(part) == 1: + logger.debug(f"Reference: {path}, whole file => {part}") + url = part[0] + start1, end1 = start, end + else: + url, start0, size = part + logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}") + end0 = start0 + size + + if start is not None: + if start >= 0: + start1 = start0 + start + else: + start1 = end0 + start + else: + start1 = start0 + if end is not None: + if end >= 0: + end1 = start0 + end + else: + end1 = end0 + end + else: + end1 = end0 + if url is None: + url = self.target + return url, start1, end1 + + async def _cat_file(self, path, start=None, end=None, **kwargs): + part_or_url, start0, end0 = self._cat_common(path, start=start, end=end) + if isinstance(part_or_url, bytes): + return part_or_url[start:end] + protocol, _ = split_protocol(part_or_url) + try: + await self.fss[protocol]._cat_file(part_or_url, start=start, end=end) + except Exception as e: + raise ReferenceNotReachable(path, part_or_url) from e + + def cat_file(self, path, start=None, end=None, **kwargs): + part_or_url, start0, end0 = self._cat_common(path, start=start, end=end) + if isinstance(part_or_url, bytes): + return part_or_url[start:end] + protocol, _ = split_protocol(part_or_url) + try: + return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0) + except Exception as e: + raise ReferenceNotReachable(path, part_or_url) from e + + def pipe_file(self, path, value, **_): + """Temporarily add binary data or reference as a file""" + self.references[path] = value + + async def _get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + return os.makedirs(lpath, exist_ok=True) + data = await self._cat_file(rpath) + with open(lpath, "wb") as f: + f.write(data) + + def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs): + if self.isdir(rpath): + return os.makedirs(lpath, exist_ok=True) + data = self.cat_file(rpath, **kwargs) + callback.set_size(len(data)) + if isfilelike(lpath): + lpath.write(data) + else: + with open(lpath, "wb") as f: + f.write(data) + callback.absolute_update(len(data)) + + def get(self, rpath, lpath, recursive=False, **kwargs): + if recursive: + # trigger directory build + self.ls("") + rpath = self.expand_path(rpath, recursive=recursive) + fs = fsspec.filesystem("file", auto_mkdir=True) + targets = other_paths(rpath, lpath) + if recursive: + data = self.cat([r for r in rpath if not self.isdir(r)]) + else: + data = self.cat(rpath) + for remote, local in zip(rpath, targets): + if remote in data: + fs.pipe_file(local, data[remote]) + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + if isinstance(path, str) and recursive: + raise NotImplementedError + if isinstance(path, list) and (recursive or any("*" in p for p in path)): + raise NotImplementedError + # TODO: if references is lazy, pre-fetch all paths in batch before access + proto_dict = _protocol_groups(path, self.references) + out = {} + for proto, paths in proto_dict.items(): + fs = self.fss[proto] + urls, starts, ends, valid_paths = [], [], [], [] + for p in paths: + # find references or label not-found. Early exit if any not + # found and on_error is "raise" + try: + u, s, e = self._cat_common(p) + except FileNotFoundError as err: + if on_error == "raise": + raise + if on_error != "omit": + out[p] = err + else: + urls.append(u) + starts.append(s) + ends.append(e) + valid_paths.append(p) + + # process references into form for merging + urls2 = [] + starts2 = [] + ends2 = [] + paths2 = [] + whole_files = set() + for u, s, e, p in zip(urls, starts, ends, valid_paths): + if isinstance(u, bytes): + # data + out[p] = u + elif s is None: + # whole file - limits are None, None, but no further + # entries take for this file + whole_files.add(u) + urls2.append(u) + starts2.append(s) + ends2.append(e) + paths2.append(p) + for u, s, e, p in zip(urls, starts, ends, valid_paths): + # second run to account for files that are to be loaded whole + if s is not None and u not in whole_files: + urls2.append(u) + starts2.append(s) + ends2.append(e) + paths2.append(p) + + # merge and fetch consolidated ranges + new_paths, new_starts, new_ends = merge_offset_ranges( + list(urls2), + list(starts2), + list(ends2), + sort=True, + max_gap=self.max_gap, + max_block=self.max_block, + ) + bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends) + + # unbundle from merged bytes - simple approach + for u, s, e, p in zip(urls, starts, ends, valid_paths): + if p in out: + continue # was bytes, already handled + for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out): + if np == u and (ns is None or ne is None): + if isinstance(b, Exception): + out[p] = b + else: + out[p] = b[s:e] + elif np == u and s >= ns and e <= ne: + if isinstance(b, Exception): + out[p] = b + else: + out[p] = b[s - ns : (e - ne) or None] + + for k, v in out.copy().items(): + # these were valid references, but fetch failed, so transform exc + if isinstance(v, Exception) and k in self.references: + ex = out[k] + new_ex = ReferenceNotReachable(k, self.references[k]) + new_ex.__cause__ = ex + if on_error == "raise": + raise new_ex + elif on_error != "omit": + out[k] = new_ex + + if len(out) == 1 and isinstance(path, str) and "*" not in path: + return _first(out) + return out + + def _process_references(self, references, template_overrides=None): + vers = references.get("version", None) + if vers is None: + self._process_references0(references) + elif vers == 1: + self._process_references1(references, template_overrides=template_overrides) + else: + raise ValueError(f"Unknown reference spec version: {vers}") + # TODO: we make dircache by iterating over all entries, but for Spec >= 1, + # can replace with programmatic. Is it even needed for mapper interface? + + def _process_references0(self, references): + """Make reference dict for Spec Version 0""" + self.references = references + + def _process_references1(self, references, template_overrides=None): + if not self.simple_templates or self.templates: + import jinja2 + self.references = {} + self._process_templates(references.get("templates", {})) + + @lru_cache(1000) + def _render_jinja(u): + return jinja2.Template(u).render(**self.templates) + + for k, v in references.get("refs", {}).items(): + if isinstance(v, str): + if v.startswith("base64:"): + self.references[k] = base64.b64decode(v[7:]) + self.references[k] = v + elif self.templates: + u = v[0] + if "{{" in u: + if self.simple_templates: + u = ( + u.replace("{{", "{") + .replace("}}", "}") + .format(**self.templates) + ) + else: + u = _render_jinja(u) + self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]] + else: + self.references[k] = v + self.references.update(self._process_gen(references.get("gen", []))) + + def _process_templates(self, tmp): + self.templates = {} + if self.template_overrides is not None: + tmp.update(self.template_overrides) + for k, v in tmp.items(): + if "{{" in v: + import jinja2 + + self.templates[k] = lambda temp=v, **kwargs: jinja2.Template( + temp + ).render(**kwargs) + else: + self.templates[k] = v + + def _process_gen(self, gens): + out = {} + for gen in gens: + dimension = { + k: v + if isinstance(v, list) + else range(v.get("start", 0), v["stop"], v.get("step", 1)) + for k, v in gen["dimensions"].items() + } + products = ( + dict(zip(dimension.keys(), values)) + for values in itertools.product(*dimension.values()) + ) + for pr in products: + import jinja2 + + key = jinja2.Template(gen["key"]).render(**pr, **self.templates) + url = jinja2.Template(gen["url"]).render(**pr, **self.templates) + if ("offset" in gen) and ("length" in gen): + offset = int( + jinja2.Template(gen["offset"]).render(**pr, **self.templates) + ) + length = int( + jinja2.Template(gen["length"]).render(**pr, **self.templates) + ) + out[key] = [url, offset, length] + elif ("offset" in gen) ^ ("length" in gen): + raise ValueError( + "Both 'offset' and 'length' are required for a " + "reference generator entry if either is provided." + ) + else: + out[key] = [url] + return out + + def _dircache_from_items(self): + self.dircache = {"": []} + it = self.references.items() + for path, part in it: + if isinstance(part, (bytes, str)): + size = len(part) + elif len(part) == 1: + size = None + else: + _, _, size = part + par = path.rsplit("/", 1)[0] if "/" in path else "" + par0 = par + subdirs = [par0] + while par0 and par0 not in self.dircache: + # collect parent directories + par0 = self._parent(par0) + subdirs.append(par0) + + subdirs.reverse() + for parent, child in zip(subdirs, subdirs[1:]): + # register newly discovered directories + assert child not in self.dircache + assert parent in self.dircache + self.dircache[parent].append( + {"name": child, "type": "directory", "size": 0} + ) + self.dircache[child] = [] + + self.dircache[par].append({"name": path, "type": "file", "size": size}) + + def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs): + data = self.cat_file(path) # load whole chunk into memory + return io.BytesIO(data) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + if isinstance(self.references, LazyReferenceMapper): + try: + return self.references.ls(path, detail) + except KeyError: + pass + raise FileNotFoundError(f"'{path}' is not a known key") + if not self.dircache: + self._dircache_from_items() + out = self._ls_from_cache(path) + if out is None: + raise FileNotFoundError(path) + if detail: + return out + return [o["name"] for o in out] + + def exists(self, path, **kwargs): # overwrite auto-sync version + return self.isdir(path) or self.isfile(path) + + def isdir(self, path): # overwrite auto-sync version + if self.dircache: + return path in self.dircache + elif isinstance(self.references, LazyReferenceMapper): + return path in self.references.listdir("") + else: + # this may be faster than building dircache for single calls, but + # by looping will be slow for many calls; could cache it? + return any(_.startswith(f"{path}/") for _ in self.references) + + def isfile(self, path): # overwrite auto-sync version + return path in self.references + + async def _ls(self, path, detail=True, **kwargs): # calls fast sync code + return self.ls(path, detail, **kwargs) + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + if withdirs: + return super().find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs + ) + if path: + path = self._strip_protocol(path) + r = sorted(k for k in self.references if k.startswith(path)) + else: + r = sorted(self.references) + if detail: + if not self.dircache: + self._dircache_from_items() + return {k: self._ls_from_cache(k)[0] for k in r} + else: + return r + + def info(self, path, **kwargs): + out = self.references.get(path) + if out is not None: + if isinstance(out, (str, bytes)): + # decode base64 here + return {"name": path, "type": "file", "size": len(out)} + elif len(out) > 1: + return {"name": path, "type": "file", "size": out[2]} + else: + out0 = [{"name": path, "type": "file", "size": None}] + else: + out = self.ls(path, True) + out0 = [o for o in out if o["name"] == path] + if not out0: + return {"name": path, "type": "directory", "size": 0} + if out0[0]["size"] is None: + # if this is a whole remote file, update size using remote FS + prot, _ = split_protocol(self.references[path][0]) + out0[0]["size"] = self.fss[prot].size(self.references[path][0]) + return out0[0] + + async def _info(self, path, **kwargs): # calls fast sync code + return self.info(path) + + async def _rm_file(self, path, **kwargs): + self.references.pop( + path, None + ) # ignores FileNotFound, just as well for directories + self.dircache.clear() # this is a bit heavy handed + + async def _pipe_file(self, path, data): + # can be str or bytes + self.references[path] = data + self.dircache.clear() # this is a bit heavy handed + + async def _put_file(self, lpath, rpath, **kwargs): + # puts binary + with open(lpath, "rb") as f: + self.references[rpath] = f.read() + self.dircache.clear() # this is a bit heavy handed + + def save_json(self, url, **storage_options): + """Write modified references into new location""" + out = {} + for k, v in self.references.items(): + if isinstance(v, bytes): + try: + out[k] = v.decode("ascii") + except UnicodeDecodeError: + out[k] = (b"base64:" + base64.b64encode(v)).decode() + else: + out[k] = v + with fsspec.open(url, "wb", **storage_options) as f: + f.write(json.dumps({"version": 1, "refs": out}).encode()) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py new file mode 100644 index 0000000000000000000000000000000000000000..77f7b370cd246f9a9bfd34141afc3edd728d13c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py @@ -0,0 +1,180 @@ +import datetime +import logging +import os +import types +import uuid +from stat import S_ISDIR, S_ISLNK + +import paramiko + +from .. import AbstractFileSystem +from ..utils import infer_storage_options + +logger = logging.getLogger("fsspec.sftp") + + +class SFTPFileSystem(AbstractFileSystem): + """Files over SFTP/SSH + + Peer-to-peer filesystem over SSH using paramiko. + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. + """ + + protocol = "sftp", "ssh" + + def __init__(self, host, **ssh_kwargs): + """ + + Parameters + ---------- + host: str + Hostname or IP as a string + temppath: str + Location on the server to put files, when within a transaction + ssh_kwargs: dict + Parameters passed on to connection. See details in + https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect + May include port, username, password... + """ + if self._cached: + return + super().__init__(**ssh_kwargs) + self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory + self.host = host + self.ssh_kwargs = ssh_kwargs + self._connect() + + def _connect(self): + logger.debug("Connecting to SFTP server %s", self.host) + self.client = paramiko.SSHClient() + self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.client.connect(self.host, **self.ssh_kwargs) + self.ftp = self.client.open_sftp() + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + def mkdir(self, path, create_parents=True, mode=511): + logger.debug("Creating folder %s", path) + if self.exists(path): + raise FileExistsError(f"File exists: {path}") + + if create_parents: + self.makedirs(path) + else: + self.ftp.mkdir(path, mode) + + def makedirs(self, path, exist_ok=False, mode=511): + if self.exists(path) and not exist_ok: + raise FileExistsError(f"File exists: {path}") + + parts = path.split("/") + new_path = "/" if path[:1] == "/" else "" + + for part in parts: + if part: + new_path = f"{new_path}/{part}" if new_path else part + if not self.exists(new_path): + self.ftp.mkdir(new_path, mode) + + def rmdir(self, path): + logger.debug("Removing folder %s", path) + self.ftp.rmdir(path) + + def info(self, path): + stat = self._decode_stat(self.ftp.stat(path)) + stat["name"] = path + return stat + + @staticmethod + def _decode_stat(stat, parent_path=None): + if S_ISDIR(stat.st_mode): + t = "directory" + elif S_ISLNK(stat.st_mode): + t = "link" + else: + t = "file" + out = { + "name": "", + "size": stat.st_size, + "type": t, + "uid": stat.st_uid, + "gid": stat.st_gid, + "time": datetime.datetime.fromtimestamp( + stat.st_atime, tz=datetime.timezone.utc + ), + "mtime": datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ), + } + if parent_path: + out["name"] = "/".join([parent_path.rstrip("/"), stat.filename]) + return out + + def ls(self, path, detail=False): + logger.debug("Listing folder %s", path) + stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)] + if detail: + return stats + else: + paths = [stat["name"] for stat in stats] + return sorted(paths) + + def put(self, lpath, rpath, callback=None, **kwargs): + logger.debug("Put file %s into %s", lpath, rpath) + self.ftp.put(lpath, rpath) + + def get_file(self, rpath, lpath, **kwargs): + if self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) + else: + self.ftp.get(self._strip_protocol(rpath), lpath) + + def _open(self, path, mode="rb", block_size=None, **kwargs): + """ + block_size: int or None + If 0, no buffering, if 1, line buffering, if >1, buffer that many + bytes, if None use default from paramiko. + """ + logger.debug("Opening file %s", path) + if kwargs.get("autocommit", True) is False: + # writes to temporary file, move on commit + path2 = "/".join([self.temppath, str(uuid.uuid4())]) + f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) + f.temppath = path2 + f.targetpath = path + f.fs = self + f.commit = types.MethodType(commit_a_file, f) + f.discard = types.MethodType(discard_a_file, f) + else: + f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) + return f + + def _rm(self, path): + if self.isdir(path): + self.ftp.rmdir(path) + else: + self.ftp.remove(path) + + def mv(self, old, new): + logger.debug("Renaming %s into %s", old, new) + self.ftp.posix_rename(old, new) + + +def commit_a_file(self): + self.fs.mv(self.temppath, self.targetpath) + + +def discard_a_file(self): + self.fs._rm(self.temppath) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py new file mode 100644 index 0000000000000000000000000000000000000000..867a581582cee1e03b50654b74537f1b3f3a4a08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py @@ -0,0 +1,333 @@ +""" +This module contains SMBFileSystem class responsible for handling access to +Windows Samba network shares by using package smbprotocol +""" + +import datetime +import uuid +from stat import S_ISDIR, S_ISLNK + +import smbclient + +from .. import AbstractFileSystem +from ..utils import infer_storage_options + +# ! pylint: disable=bad-continuation + + +class SMBFileSystem(AbstractFileSystem): + """Allow reading and writing to Windows and Samba network shares. + + When using `fsspec.open()` for getting a file-like object the URI + should be specified as this format: + ``smb://workgroup;user:password@server:port/share/folder/file.csv``. + + Example:: + + >>> import fsspec + >>> with fsspec.open( + ... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv' + ... ) as smbfile: + ... df = pd.read_csv(smbfile, sep='|', header=None) + + Note that you need to pass in a valid hostname or IP address for the host + component of the URL. Do not use the Windows/NetBIOS machine name for the + host component. + + The first component of the path in the URL points to the name of the shared + folder. Subsequent path components will point to the directory/folder/file. + + The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be + optional. + + .. note:: + + For working this source require `smbprotocol`_ to be installed, e.g.:: + + $ pip install smbprotocol + # or + # pip install smbprotocol[kerberos] + + .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. + """ + + protocol = "smb" + + # pylint: disable=too-many-arguments + def __init__( + self, + host, + port=None, + username=None, + password=None, + timeout=60, + encrypt=None, + share_access=None, + register_session_retries=5, + **kwargs, + ): + """ + You can use _get_kwargs_from_urls to get some kwargs from + a reasonable SMB url. + + Authentication will be anonymous or integrated if username/password are not + given. + + Parameters + ---------- + host: str + The remote server name/ip to connect to + port: int or None + Port to connect with. Usually 445, sometimes 139. + username: str or None + Username to connect with. Required if Kerberos auth is not being used. + password: str or None + User's password on the server, if using username + timeout: int + Connection timeout in seconds + encrypt: bool + Whether to force encryption or not, once this has been set to True + the session cannot be changed back to False. + share_access: str or None + Specifies the default access applied to file open operations + performed with this file system object. + This affects whether other processes can concurrently open a handle + to the same file. + + - None (the default): exclusively locks the file until closed. + - 'r': Allow other handles to be opened with read access. + - 'w': Allow other handles to be opened with write access. + - 'd': Allow other handles to be opened with delete access. + """ + super().__init__(**kwargs) + self.host = host + self.port = port + self.username = username + self.password = password + self.timeout = timeout + self.encrypt = encrypt + self.temppath = kwargs.pop("temppath", "") + self.share_access = share_access + self.register_session_retries = register_session_retries + self._connect() + + @property + def _port(self): + return 445 if self.port is None else self.port + + def _connect(self): + import time + + for _ in range(self.register_session_retries): + try: + smbclient.register_session( + self.host, + username=self.username, + password=self.password, + port=self._port, + encrypt=self.encrypt, + connection_timeout=self.timeout, + ) + break + except Exception: + time.sleep(0.1) + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(path): + # smb://workgroup;user:password@host:port/share/folder/file.csv + out = infer_storage_options(path) + out.pop("path", None) + out.pop("protocol", None) + return out + + def mkdir(self, path, create_parents=True, **kwargs): + wpath = _as_unc_path(self.host, path) + if create_parents: + smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs) + else: + smbclient.mkdir(wpath, port=self._port, **kwargs) + + def makedirs(self, path, exist_ok=False): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port) + + def rmdir(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.rmdir(wpath, port=self._port) + + def info(self, path, **kwargs): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port, **kwargs) + if S_ISDIR(stats.st_mode): + stype = "directory" + elif S_ISLNK(stats.st_mode): + stype = "link" + else: + stype = "file" + res = { + "name": path + "/" if stype == "directory" else path, + "size": stats.st_size, + "type": stype, + "uid": stats.st_uid, + "gid": stats.st_gid, + "time": stats.st_atime, + "mtime": stats.st_mtime, + } + return res + + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc) + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc) + + def ls(self, path, detail=True, **kwargs): + unc = _as_unc_path(self.host, path) + listed = smbclient.listdir(unc, port=self._port, **kwargs) + dirs = ["/".join([path.rstrip("/"), p]) for p in listed] + if detail: + dirs = [self.info(d) for d in dirs] + return dirs + + # pylint: disable=too-many-arguments + def _open( + self, + path, + mode="rb", + block_size=-1, + autocommit=True, + cache_options=None, + **kwargs, + ): + """ + block_size: int or None + If 0, no buffering, 1, line buffering, >1, buffer that many bytes + + Notes + ----- + By specifying 'share_access' in 'kwargs' it is possible to override the + default shared access setting applied in the constructor of this object. + """ + bls = block_size if block_size is not None and block_size >= 0 else -1 + wpath = _as_unc_path(self.host, path) + share_access = kwargs.pop("share_access", self.share_access) + if "w" in mode and autocommit is False: + temp = _as_temp_path(self.host, path, self.temppath) + return SMBFileOpener( + wpath, temp, mode, port=self._port, block_size=bls, **kwargs + ) + return smbclient.open_file( + wpath, + mode, + buffering=bls, + share_access=share_access, + port=self._port, + **kwargs, + ) + + def copy(self, path1, path2, **kwargs): + """Copy within two locations in the same filesystem""" + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs) + + def _rm(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, port=self._port) + if S_ISDIR(stats.st_mode): + smbclient.rmdir(wpath, port=self._port) + else: + smbclient.remove(wpath, port=self._port) + + def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs): + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + smbclient.rename(wpath1, wpath2, port=self._port, **kwargs) + + +def _as_unc_path(host, path): + rpath = path.replace("/", "\\") + unc = f"\\\\{host}{rpath}" + return unc + + +def _as_temp_path(host, path, temppath): + share = path.split("/")[1] + temp_file = f"/{share}{temppath}/{uuid.uuid4()}" + unc = _as_unc_path(host, temp_file) + return unc + + +def _share_has_path(path): + parts = path.count("/") + if path.endswith("/"): + return parts > 2 + return parts > 1 + + +class SMBFileOpener: + """writes to remote temporary file, move on commit""" + + def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs): + self.path = path + self.temp = temp + self.mode = mode + self.block_size = block_size + self.kwargs = kwargs + self.smbfile = None + self._incontext = False + self.port = port + self._open() + + def _open(self): + if self.smbfile is None or self.smbfile.closed: + self.smbfile = smbclient.open_file( + self.temp, + self.mode, + port=self.port, + buffering=self.block_size, + **self.kwargs, + ) + + def commit(self): + """Move temp file to definitive on success.""" + # TODO: use transaction support in SMB protocol + smbclient.replace(self.temp, self.path, port=self.port) + + def discard(self): + """Remove the temp file on failure.""" + smbclient.remove(self.temp, port=self.port) + + def __fspath__(self): + return self.path + + def __iter__(self): + return self.smbfile.__iter__() + + def __getattr__(self, item): + return getattr(self.smbfile, item) + + def __enter__(self): + self._incontext = True + return self.smbfile.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + self._incontext = False + self.smbfile.__exit__(exc_type, exc_value, traceback) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py new file mode 100644 index 0000000000000000000000000000000000000000..412e5ba4d2cdea7db090dc96412e697909a38d78 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py @@ -0,0 +1,124 @@ +import logging +import tarfile + +import fsspec +from fsspec.archive import AbstractArchiveFileSystem +from fsspec.compression import compr +from fsspec.utils import infer_compression + +typemap = {b"0": "file", b"5": "directory"} + +logger = logging.getLogger("tar") + + +class TarFileSystem(AbstractArchiveFileSystem): + """Compressed Tar archives as a file-system (read-only) + + Supports the following formats: + tar.gz, tar.bz2, tar.xz + """ + + root_marker = "" + protocol = "tar" + cachable = False + + def __init__( + self, + fo="", + index_store=None, + target_options=None, + target_protocol=None, + compression=None, + **kwargs, + ): + super().__init__(**kwargs) + target_options = target_options or {} + + if isinstance(fo, str): + self.of = fsspec.open(fo, protocol=target_protocol, **target_options) + fo = self.of.open() # keep the reference + + # Try to infer compression. + if compression is None: + name = None + + # Try different ways to get hold of the filename. `fo` might either + # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an + # `fsspec.AbstractFileSystem` instance. + try: + # Amended io.BufferedReader or similar. + # This uses a "protocol extension" where original filenames are + # propagated to archive-like filesystems in order to let them + # infer the right compression appropriately. + if hasattr(fo, "original"): + name = fo.original + + # fsspec.LocalFileOpener + elif hasattr(fo, "path"): + name = fo.path + + # io.BufferedReader + elif hasattr(fo, "name"): + name = fo.name + + # fsspec.AbstractFileSystem + elif hasattr(fo, "info"): + name = fo.info()["name"] + + except Exception as ex: + logger.warning( + f"Unable to determine file name, not inferring compression: {ex}" + ) + + if name is not None: + compression = infer_compression(name) + logger.info(f"Inferred compression {compression} from file name {name}") + + if compression is not None: + # TODO: tarfile already implements compression with modes like "'r:gz'", + # but then would seek to offset in the file work? + fo = compr[compression](fo) + + self._fo_ref = fo + self.fo = fo # the whole instance is a context + self.tar = tarfile.TarFile(fileobj=self.fo) + self.dir_cache = None + + self.index_store = index_store + self.index = None + self._index() + + def _index(self): + # TODO: load and set saved index, if exists + out = {} + for ti in self.tar: + info = ti.get_info() + info["type"] = typemap.get(info["type"], "file") + name = ti.get_info()["name"].rstrip("/") + out[name] = (info, ti.offset_data) + + self.index = out + # TODO: save index to self.index_store here, if set + + def _get_dirs(self): + if self.dir_cache is not None: + return + + # This enables ls to get directories as children as well as files + self.dir_cache = { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(self.tar.getnames()) + } + for member in self.tar.getmembers(): + info = member.get_info() + info["name"] = info["name"].rstrip("/") + info["type"] = typemap.get(info["type"], "file") + self.dir_cache[info["name"]] = info + + def _open(self, path, mode="rb", **kwargs): + if mode != "rb": + raise ValueError("Read-only filesystem implementation") + details, offset = self.index[path] + if details["type"] != "file": + raise ValueError("Can only handle regular files") + return self.tar.extractfile(path) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py new file mode 100644 index 0000000000000000000000000000000000000000..4bac5d51aa52ccfa3319d86c8c8cd384497881a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py @@ -0,0 +1,484 @@ +# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html + +import logging +import os +import secrets +import shutil +import tempfile +import uuid +from contextlib import suppress +from urllib.parse import quote + +import requests + +from ..spec import AbstractBufferedFile, AbstractFileSystem +from ..utils import infer_storage_options, tokenize + +logger = logging.getLogger("webhdfs") + + +class WebHDFS(AbstractFileSystem): + """ + Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways. + + Four auth mechanisms are supported: + + insecure: no auth is done, and the user is assumed to be whoever they + say they are (parameter ``user``), or a predefined value such as + "dr.who" if not given + spnego: when kerberos authentication is enabled, auth is negotiated by + requests_kerberos https://github.com/requests/requests-kerberos . + This establishes a session based on existing kinit login and/or + specified principal/password; parameters are passed with ``kerb_kwargs`` + token: uses an existing Hadoop delegation token from another secured + service. Indeed, this client can also generate such tokens when + not insecure. Note that tokens expire, but can be renewed (by a + previously specified user) and may allow for proxying. + basic-auth: used when both parameter ``user`` and parameter ``password`` + are provided. + + """ + + tempdir = str(tempfile.gettempdir()) + protocol = "webhdfs", "webHDFS" + + def __init__( + self, + host, + port=50070, + kerberos=False, + token=None, + user=None, + password=None, + proxy_to=None, + kerb_kwargs=None, + data_proxy=None, + use_https=False, + session_cert=None, + session_verify=True, + **kwargs, + ): + """ + Parameters + ---------- + host: str + Name-node address + port: int + Port for webHDFS + kerberos: bool + Whether to authenticate with kerberos for this connection + token: str or None + If given, use this token on every call to authenticate. A user + and user-proxy may be encoded in the token and should not be also + given + user: str or None + If given, assert the user name to connect with + password: str or None + If given, assert the password to use for basic auth. If password + is provided, user must be provided also + proxy_to: str or None + If given, the user has the authority to proxy, and this value is + the user in who's name actions are taken + kerb_kwargs: dict + Any extra arguments for HTTPKerberosAuth, see + ``_ + data_proxy: dict, callable or None + If given, map data-node addresses. This can be necessary if the + HDFS cluster is behind a proxy, running on Docker or otherwise has + a mismatch between the host-names given by the name-node and the + address by which to refer to them from the client. If a dict, + maps host names ``host->data_proxy[host]``; if a callable, full + URLs are passed, and function must conform to + ``url->data_proxy(url)``. + use_https: bool + Whether to connect to the Name-node using HTTPS instead of HTTP + session_cert: str or Tuple[str, str] or None + Path to a certificate file, or tuple of (cert, key) files to use + for the requests.Session + session_verify: str, bool or None + Path to a certificate file to use for verifying the requests.Session. + kwargs + """ + if self._cached: + return + super().__init__(**kwargs) + self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1" # noqa + self.kerb = kerberos + self.kerb_kwargs = kerb_kwargs or {} + self.pars = {} + self.proxy = data_proxy or {} + if token is not None: + if user is not None or proxy_to is not None: + raise ValueError( + "If passing a delegation token, must not set " + "user or proxy_to, as these are encoded in the" + " token" + ) + self.pars["delegation"] = token + self.user = user + self.password = password + + if password is not None: + if user is None: + raise ValueError( + "If passing a password, the user must also be" + "set in order to set up the basic-auth" + ) + else: + if user is not None: + self.pars["user.name"] = user + + if proxy_to is not None: + self.pars["doas"] = proxy_to + if kerberos and user is not None: + raise ValueError( + "If using Kerberos auth, do not specify the " + "user, this is handled by kinit." + ) + + self.session_cert = session_cert + self.session_verify = session_verify + + self._connect() + + self._fsid = f"webhdfs_{tokenize(host, port)}" + + @property + def fsid(self): + return self._fsid + + def _connect(self): + self.session = requests.Session() + + if self.session_cert: + self.session.cert = self.session_cert + + self.session.verify = self.session_verify + + if self.kerb: + from requests_kerberos import HTTPKerberosAuth + + self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) + + if self.user is not None and self.password is not None: + from requests.auth import HTTPBasicAuth + + self.session.auth = HTTPBasicAuth(self.user, self.password) + + def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): + url = self._apply_proxy(self.url + quote(path or "", safe="/=")) + args = kwargs.copy() + args.update(self.pars) + args["op"] = op.upper() + logger.debug("sending %s with %s", url, method) + out = self.session.request( + method=method.upper(), + url=url, + params=args, + data=data, + allow_redirects=redirect, + ) + if out.status_code in [400, 401, 403, 404, 500]: + try: + err = out.json() + msg = err["RemoteException"]["message"] + exp = err["RemoteException"]["exception"] + except (ValueError, KeyError): + pass + else: + if exp in ["IllegalArgumentException", "UnsupportedOperationException"]: + raise ValueError(msg) + elif exp in ["SecurityException", "AccessControlException"]: + raise PermissionError(msg) + elif exp in ["FileNotFoundException"]: + raise FileNotFoundError(msg) + else: + raise RuntimeError(msg) + out.raise_for_status() + return out + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + replication=None, + permissions=None, + **kwargs, + ): + """ + + Parameters + ---------- + path: str + File location + mode: str + 'rb', 'wb', etc. + block_size: int + Client buffer size for read-ahead or write buffer + autocommit: bool + If False, writes to temporary file that only gets put in final + location upon commit + replication: int + Number of copies of file on the cluster, write mode only + permissions: str or int + posix permissions, write mode only + kwargs + + Returns + ------- + WebHDFile instance + """ + block_size = block_size or self.blocksize + return WebHDFile( + self, + path, + mode=mode, + block_size=block_size, + tempdir=self.tempdir, + autocommit=autocommit, + replication=replication, + permissions=permissions, + ) + + @staticmethod + def _process_info(info): + info["type"] = info["type"].lower() + info["size"] = info["length"] + return info + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + if "username" in out: + out["user"] = out.pop("username") + return out + + def info(self, path): + out = self._call("GETFILESTATUS", path=path) + info = out.json()["FileStatus"] + info["name"] = path + return self._process_info(info) + + def ls(self, path, detail=False): + out = self._call("LISTSTATUS", path=path) + infos = out.json()["FileStatuses"]["FileStatus"] + for info in infos: + self._process_info(info) + info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] + if detail: + return sorted(infos, key=lambda i: i["name"]) + else: + return sorted(info["name"] for info in infos) + + def content_summary(self, path): + """Total numbers of files, directories and bytes under path""" + out = self._call("GETCONTENTSUMMARY", path=path) + return out.json()["ContentSummary"] + + def ukey(self, path): + """Checksum info of file, giving method and result""" + out = self._call("GETFILECHECKSUM", path=path, redirect=False) + if "Location" in out.headers: + location = self._apply_proxy(out.headers["Location"]) + out2 = self.session.get(location) + out2.raise_for_status() + return out2.json()["FileChecksum"] + else: + out.raise_for_status() + return out.json()["FileChecksum"] + + def home_directory(self): + """Get user's home directory""" + out = self._call("GETHOMEDIRECTORY") + return out.json()["Path"] + + def get_delegation_token(self, renewer=None): + """Retrieve token which can give the same authority to other uses + + Parameters + ---------- + renewer: str or None + User who may use this token; if None, will be current user + """ + if renewer: + out = self._call("GETDELEGATIONTOKEN", renewer=renewer) + else: + out = self._call("GETDELEGATIONTOKEN") + t = out.json()["Token"] + if t is None: + raise ValueError("No token available for this user/security context") + return t["urlString"] + + def renew_delegation_token(self, token): + """Make token live longer. Returns new expiry time""" + out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) + return out.json()["long"] + + def cancel_delegation_token(self, token): + """Stop the token from being useful""" + self._call("CANCELDELEGATIONTOKEN", method="put", token=token) + + def chmod(self, path, mod): + """Set the permission at path + + Parameters + ---------- + path: str + location to set (file or directory) + mod: str or int + posix epresentation or permission, give as oct string, e.g, '777' + or 0o777 + """ + self._call("SETPERMISSION", method="put", path=path, permission=mod) + + def chown(self, path, owner=None, group=None): + """Change owning user and/or group""" + kwargs = {} + if owner is not None: + kwargs["owner"] = owner + if group is not None: + kwargs["group"] = group + self._call("SETOWNER", method="put", path=path, **kwargs) + + def set_replication(self, path, replication): + """ + Set file replication factor + + Parameters + ---------- + path: str + File location (not for directories) + replication: int + Number of copies of file on the cluster. Should be smaller than + number of data nodes; normally 3 on most systems. + """ + self._call("SETREPLICATION", path=path, method="put", replication=replication) + + def mkdir(self, path, **kwargs): + self._call("MKDIRS", method="put", path=path) + + def makedirs(self, path, exist_ok=False): + if exist_ok is False and self.exists(path): + raise FileExistsError(path) + self.mkdir(path) + + def mv(self, path1, path2, **kwargs): + self._call("RENAME", method="put", path=path1, destination=path2) + + def rm(self, path, recursive=False, **kwargs): + self._call( + "DELETE", + method="delete", + path=path, + recursive="true" if recursive else "false", + ) + + def rm_file(self, path, **kwargs): + self.rm(path) + + def cp_file(self, lpath, rpath, **kwargs): + with self.open(lpath) as lstream: + tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"]) + # Perform an atomic copy (stream to a temporary file and + # move it to the actual destination). + try: + with self.open(tmp_fname, "wb") as rstream: + shutil.copyfileobj(lstream, rstream) + self.mv(tmp_fname, rpath) + except BaseException: # noqa + with suppress(FileNotFoundError): + self.rm(tmp_fname) + raise + + def _apply_proxy(self, location): + if self.proxy and callable(self.proxy): + location = self.proxy(location) + elif self.proxy: + # as a dict + for k, v in self.proxy.items(): + location = location.replace(k, v, 1) + return location + + +class WebHDFile(AbstractBufferedFile): + """A file living in HDFS over webHDFS""" + + def __init__(self, fs, path, **kwargs): + super().__init__(fs, path, **kwargs) + kwargs = kwargs.copy() + if kwargs.get("permissions", None) is None: + kwargs.pop("permissions", None) + if kwargs.get("replication", None) is None: + kwargs.pop("replication", None) + self.permissions = kwargs.pop("permissions", 511) + tempdir = kwargs.pop("tempdir") + if kwargs.pop("autocommit", False) is False: + self.target = self.path + self.path = os.path.join(tempdir, str(uuid.uuid4())) + + def _upload_chunk(self, final=False): + """Write one part of a multi-block file upload + + Parameters + ========== + final: bool + This is the last block, so should complete file, if + self.autocommit is True. + """ + out = self.fs.session.post( + self.location, + data=self.buffer.getvalue(), + headers={"content-type": "application/octet-stream"}, + ) + out.raise_for_status() + return True + + def _initiate_upload(self): + """Create remote file/upload""" + kwargs = self.kwargs.copy() + if "a" in self.mode: + op, method = "APPEND", "POST" + else: + op, method = "CREATE", "PUT" + kwargs["overwrite"] = "true" + out = self.fs._call(op, method, self.path, redirect=False, **kwargs) + location = self.fs._apply_proxy(out.headers["Location"]) + if "w" in self.mode: + # create empty file to append to + out2 = self.fs.session.put( + location, headers={"content-type": "application/octet-stream"} + ) + out2.raise_for_status() + # after creating empty file, change location to append to + out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs) + self.location = self.fs._apply_proxy(out2.headers["Location"]) + + def _fetch_range(self, start, end): + start = max(start, 0) + end = min(self.size, end) + if start >= end or start >= self.size: + return b"" + out = self.fs._call( + "OPEN", path=self.path, offset=start, length=end - start, redirect=False + ) + out.raise_for_status() + if "Location" in out.headers: + location = out.headers["Location"] + out2 = self.fs.session.get(self.fs._apply_proxy(location)) + return out2.content + else: + return out.content + + def commit(self): + self.fs.mv(self.path, self.target) + + def discard(self): + self.fs.rm(self.path) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py new file mode 100644 index 0000000000000000000000000000000000000000..9d9c046bfde313b6868399c4d200bc779c1ab19f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py @@ -0,0 +1,134 @@ +import zipfile + +import fsspec +from fsspec.archive import AbstractArchiveFileSystem + + +class ZipFileSystem(AbstractArchiveFileSystem): + """Read/Write contents of ZIP archive as a file-system + + Keeps file object open while instance lives. + + This class is pickleable, but not necessarily thread-safe + """ + + root_marker = "" + protocol = "zip" + cachable = False + + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + compression=zipfile.ZIP_STORED, + allowZip64=True, + compresslevel=None, + **kwargs, + ): + """ + Parameters + ---------- + fo: str or file-like + Contains ZIP, and must exist. If a str, will fetch file using + :meth:`~fsspec.open_files`, which must return one file exactly. + mode: str + Accept: "r", "w", "a" + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. + compression, allowZip64, compresslevel: passed to ZipFile + Only relevant when creating a ZIP + """ + super().__init__(self, **kwargs) + if mode not in set("rwa"): + raise ValueError(f"mode '{mode}' no understood") + self.mode = mode + if isinstance(fo, str): + if mode == "a": + m = "r+b" + else: + m = mode + "b" + fo = fsspec.open( + fo, mode=m, protocol=target_protocol, **(target_options or {}) + ) + self.force_zip_64 = allowZip64 + self.of = fo + self.fo = fo.__enter__() # the whole instance is a context + self.zip = zipfile.ZipFile( + self.fo, + mode=mode, + compression=compression, + allowZip64=allowZip64, + compresslevel=compresslevel, + ) + self.dir_cache = None + + @classmethod + def _strip_protocol(cls, path): + # zip file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + def __del__(self): + if hasattr(self, "zip"): + self.close() + del self.zip + + def close(self): + """Commits any write changes to the file. Done on ``del`` too.""" + self.zip.close() + + def _get_dirs(self): + if self.dir_cache is None or self.mode in set("wa"): + # when writing, dir_cache is always in the ZipFile's attributes, + # not read from the file. + files = self.zip.infolist() + self.dir_cache = { + dirname.rstrip("/"): { + "name": dirname.rstrip("/"), + "size": 0, + "type": "directory", + } + for dirname in self._all_dirnames(self.zip.namelist()) + } + for z in files: + f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__} + f.update( + { + "name": z.filename.rstrip("/"), + "size": z.file_size, + "type": ("directory" if z.is_dir() else "file"), + } + ) + self.dir_cache[f["name"]] = f + + def pipe_file(self, path, value, **kwargs): + # override upstream, because we know the exact file size in this case + self.zip.writestr(path, value, **kwargs) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if "r" in mode and self.mode in set("wa"): + if self.exists(path): + raise OSError("ZipFS can only be open for reading or writing, not both") + raise FileNotFoundError(path) + if "r" in self.mode and "w" in mode: + raise OSError("ZipFS can only be open for reading or writing, not both") + out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64) + if "r" in mode: + info = self.info(path) + out.size = info["size"] + out.name = info["name"] + return out diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1efb0414e77679a86ca80af89e32655df306c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py @@ -0,0 +1,188 @@ +import contextlib +import gzip +import json +import os +import threading +from collections import ChainMap +from http.server import BaseHTTPRequestHandler, HTTPServer + +import pytest + +requests = pytest.importorskip("requests") +port = 9898 +data = b"\n".join([b"some test data"] * 1000) +realfile = f"http://127.0.0.1:{port}/index/realfile" +index = b'Link' % realfile.encode() +listing = open( + os.path.join(os.path.dirname(__file__), "data", "listing.html"), "rb" +).read() +win = os.name == "nt" + + +def _make_listing(*paths): + return "\n".join( + f'Link_{i}' + for i, f in enumerate(paths) + ).encode() + + +@pytest.fixture +def reset_files(): + yield + + # Reset the newly added files after the + # test is completed. + HTTPTestHandler.dynamic_files.clear() + + +class HTTPTestHandler(BaseHTTPRequestHandler): + static_files = { + "/index/realfile": data, + "/index/otherfile": data, + "/index": index, + "/data/20020401": listing, + "/simple/": _make_listing("/simple/file", "/simple/dir/"), + "/simple/file": data, + "/simple/dir/": _make_listing("/simple/dir/file"), + "/simple/dir/file": data, + } + dynamic_files = {} + + files = ChainMap(dynamic_files, static_files) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _respond(self, code=200, headers=None, data=b""): + headers = headers or {} + headers.update({"User-Agent": "test"}) + self.send_response(code) + for k, v in headers.items(): + self.send_header(k, str(v)) + self.end_headers() + if data: + self.wfile.write(data) + + def do_GET(self): + file_path = self.path + if file_path.endswith("/") and file_path.rstrip("/") in self.files: + file_path = file_path.rstrip("/") + file_data = self.files.get(file_path) + if "give_path" in self.headers: + return self._respond(200, data=json.dumps({"path": self.path}).encode()) + if "redirect" in self.headers and file_path != "/index/realfile": + new_url = f"http://127.0.0.1:{port}/index/realfile" + return self._respond(301, {"Location": new_url}) + if file_data is None: + return self._respond(404) + + status = 200 + content_range = f"bytes 0-{len(file_data) - 1}/{len(file_data)}" + if ("Range" in self.headers) and ("ignore_range" not in self.headers): + ran = self.headers["Range"] + b, ran = ran.split("=") + start, end = ran.split("-") + if start: + content_range = f"bytes {start}-{end}/{len(file_data)}" + file_data = file_data[int(start) : (int(end) + 1) if end else None] + else: + # suffix only + l = len(file_data) + content_range = f"bytes {l - int(end)}-{l - 1}/{l}" + file_data = file_data[-int(end) :] + if "use_206" in self.headers: + status = 206 + if "give_length" in self.headers: + if "gzip_encoding" in self.headers: + file_data = gzip.compress(file_data) + response_headers = { + "Content-Length": len(file_data), + "Content-Encoding": "gzip", + } + else: + response_headers = {"Content-Length": len(file_data)} + self._respond(status, response_headers, file_data) + elif "give_range" in self.headers: + self._respond(status, {"Content-Range": content_range}, file_data) + elif "give_mimetype" in self.headers: + self._respond( + status, {"Content-Type": "text/html; charset=utf-8"}, file_data + ) + else: + self._respond(status, data=file_data) + + def do_POST(self): + length = self.headers.get("Content-Length") + file_path = self.path.rstrip("/") + if length is None: + assert self.headers.get("Transfer-Encoding") == "chunked" + self.files[file_path] = b"".join(self.read_chunks()) + else: + self.files[file_path] = self.rfile.read(length) + self._respond(200) + + do_PUT = do_POST + + def read_chunks(self): + length = -1 + while length != 0: + line = self.rfile.readline().strip() + if len(line) == 0: + length = 0 + else: + length = int(line, 16) + yield self.rfile.read(length) + self.rfile.readline() + + def do_HEAD(self): + if "head_not_auth" in self.headers: + return self._respond( + 403, {"Content-Length": 123}, b"not authorized for HEAD request" + ) + elif "head_ok" not in self.headers: + return self._respond(405) + + file_path = self.path.rstrip("/") + file_data = self.files.get(file_path) + if file_data is None: + return self._respond(404) + + if ("give_length" in self.headers) or ("head_give_length" in self.headers): + response_headers = {"Content-Length": len(file_data)} + if "zero_length" in self.headers: + response_headers["Content-Length"] = 0 + elif "gzip_encoding" in self.headers: + file_data = gzip.compress(file_data) + response_headers["Content-Encoding"] = "gzip" + response_headers["Content-Length"] = len(file_data) + + self._respond(200, response_headers) + elif "give_range" in self.headers: + self._respond( + 200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"} + ) + elif "give_etag" in self.headers: + self._respond(200, {"ETag": "xxx"}) + else: + self._respond(200) # OK response, but no useful info + + +@contextlib.contextmanager +def serve(): + server_address = ("", port) + httpd = HTTPServer(server_address, HTTPTestHandler) + th = threading.Thread(target=httpd.serve_forever) + th.daemon = True + th.start() + try: + yield f"http://127.0.0.1:{port}" + finally: + httpd.socket.close() + httpd.shutdown() + th.join() + + +@pytest.fixture(scope="module") +def server(): + with serve() as s: + yield s diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..24e6101f0988a90f842df12dcc44fdb8cabab7e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py @@ -0,0 +1,498 @@ +"""Tests the spec, using memoryfs""" + +import contextlib +import os +import pickle +import tempfile +from unittest.mock import Mock + +import pytest + +import fsspec +from fsspec.implementations.memory import MemoryFile, MemoryFileSystem + + +def test_idempotent(): + MemoryFileSystem.clear_instance_cache() + fs = MemoryFileSystem() + fs2 = MemoryFileSystem() + assert fs is fs2 + assert MemoryFileSystem.current() is fs2 + + MemoryFileSystem.clear_instance_cache() + assert not MemoryFileSystem._cache + + fs2 = MemoryFileSystem().current() + assert fs == fs2 + + +def test_pickle(): + fs = MemoryFileSystem() + fs2 = pickle.loads(pickle.dumps(fs)) + assert fs == fs2 + + +def test_class_methods(): + assert MemoryFileSystem._strip_protocol("memory://stuff") == "/stuff" + assert MemoryFileSystem._strip_protocol("stuff") == "/stuff" + assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff" + + assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {} + + +def test_multi(m): + m.pipe("/afile", b"data") + fs, token, paths = fsspec.core.get_fs_token_paths(["/afile", "/afile"]) + assert len(paths) == 2 + + +def test_get_put(tmpdir, m): + tmpdir = str(tmpdir) + fn = os.path.join(tmpdir, "one") + open(fn, "wb").write(b"one") + os.mkdir(os.path.join(tmpdir, "dir")) + fn2 = os.path.join(tmpdir, "dir", "two") + open(fn2, "wb").write(b"two") + + fs = MemoryFileSystem() + fs.put(fn, "/afile") + assert fs.cat("/afile") == b"one" + + fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data") + fn3 = os.path.join(tmpdir, "three") + fs.get("/bfile", fn3) + assert open(fn3, "rb").read() == b"data" + + fs.put(tmpdir, "/more", recursive=True) + assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"] + + @contextlib.contextmanager + def tmp_chdir(path): + curdir = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(curdir) + + with tmp_chdir(os.path.join(tmpdir, os.path.pardir)): + fs.put(os.path.basename(tmpdir), "/moretwo", recursive=True) + assert fs.find("/moretwo") == [ + "/moretwo/dir/two", + "/moretwo/one", + "/moretwo/three", + ] + + with tmp_chdir(tmpdir): + fs.put(os.path.curdir, "/morethree", recursive=True) + assert fs.find("/morethree") == [ + "/morethree/dir/two", + "/morethree/one", + "/morethree/three", + ] + + for f in [fn, fn2, fn3]: + os.remove(f) + os.rmdir(os.path.join(tmpdir, "dir")) + + fs.get("/more/", tmpdir + "/", recursive=True) + assert open(fn3, "rb").read() == b"data" + assert open(fn, "rb").read() == b"one" + + +def test_du(m): + fs = MemoryFileSystem() + fs.store.update( + { + "/dir/afile": MemoryFile(fs, "/afile", b"a"), + "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"), + "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"), + } + ) + assert fs.du("/dir") == 6 + assert fs.du("/dir", total=False) == { + "/dir/afile": 1, + "/dir/dirb/afile": 2, + "/dir/dirb/bfile": 3, + } + assert fs.du("/dir", withdirs=True) == 6 + assert fs.du("/dir", total=False, withdirs=True) == { + "/dir": 0, + "/dir/afile": 1, + "/dir/dirb": 0, + "/dir/dirb/afile": 2, + "/dir/dirb/bfile": 3, + } + with pytest.raises(ValueError): + assert fs.du("/dir", maxdepth=0) == 1 + assert fs.du("/dir", total=False, withdirs=True, maxdepth=1) == { + "/dir": 0, + "/dir/afile": 1, + "/dir/dirb": 0, + } + + # Size of file only. + assert fs.du("/dir/afile") == 1 + assert fs.du("/dir/afile", withdirs=True) == 1 + + +def test_head_tail(m): + fs = MemoryFileSystem() + with fs.open("/myfile", "wb") as f: + f.write(b"I had a nice big cabbage") + assert fs.head("/myfile", 5) == b"I had" + assert fs.tail("/myfile", 7) == b"cabbage" + + +def test_move(m): + fs = MemoryFileSystem() + with fs.open("/myfile", "wb") as f: + f.write(b"I had a nice big cabbage") + fs.move("/myfile", "/otherfile") + assert not fs.exists("/myfile") + assert fs.info("/otherfile") + assert isinstance(fs.ukey("/otherfile"), str) + + +def test_recursive_get_put(tmpdir, m): + fs = MemoryFileSystem() + os.makedirs(f"{tmpdir}/nest") + for file in ["one", "two", "nest/other"]: + with open(f"{tmpdir}/{file}", "wb") as f: + f.write(b"data") + + fs.put(str(tmpdir), "test", recursive=True) + + # get to directory with slash + d = tempfile.mkdtemp() + fs.get("test/", d, recursive=True) + for file in ["one", "two", "nest/other"]: + with open(f"{d}/{file}", "rb") as f: + f.read() == b"data" + + # get to directory without slash + d = tempfile.mkdtemp() + fs.get("test", d, recursive=True) + for file in ["test/one", "test/two", "test/nest/other"]: + with open(f"{d}/{file}", "rb") as f: + f.read() == b"data" + + +def test_pipe_cat(m): + fs = MemoryFileSystem() + fs.pipe("afile", b"contents") + assert fs.cat("afile") == b"contents" + + data = {"/bfile": b"more", "/cfile": b"stuff"} + fs.pipe(data) + assert fs.cat(list(data)) == data + + +def test_read_block_delimiter(m): + fs = MemoryFileSystem() + with fs.open("/myfile", "wb") as f: + f.write(b"some\nlines\nof\ntext") + assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n" + assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n" + assert fs.read_block("/myfile", 6, 2, b"\n") == b"" + assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n" + assert fs.read_block("/myfile", 12, 6, b"\n") == b"text" + assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile") + + +def test_open_text(m): + fs = MemoryFileSystem() + with fs.open("/myfile", "wb") as f: + f.write(b"some\nlines\nof\ntext") + f = fs.open("/myfile", "r", encoding="latin1") + assert f.encoding == "latin1" + + +def test_read_text(m): + with m.open("/myfile", "w", encoding="utf-8") as f: + f.write("some\nlines\nof\ntext") + assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext" + + +def test_write_text(m): + m.write_text("/myfile", "some\nlines\nof\ntext", encoding="utf-8") + assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext" + + +def test_chained_fs(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + with open(f1, "wb") as f: + f.write(b"test") + + of = fsspec.open( + f"simplecache::file://{f1}", + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of as f: + assert f.read() == b"test" + + assert os.listdir(d2) == ["f1"] + + +@pytest.mark.xfail(reason="see issue #334", strict=True) +def test_multilevel_chained_fs(): + """This test reproduces fsspec/filesystem_spec#334""" + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1.zip") + with zipfile.ZipFile(f1, mode="w") as z: + # filename, content + z.writestr("foo.txt", "foo.txt") + z.writestr("bar.txt", "bar.txt") + + # We expected this to be the correct syntax + with pytest.raises(IsADirectoryError): + of = fsspec.open_files(f"zip://*.txt::simplecache::file://{f1}") + assert len(of) == 2 + + # But this is what is actually valid... + of = fsspec.open_files(f"zip://*.txt::simplecache://{f1}::file://") + + assert len(of) == 2 + for open_file in of: + with open_file as f: + assert f.read().decode("utf-8") == f.name + + +def test_multilevel_chained_fs_zip_zip_file(): + """This test reproduces fsspec/filesystem_spec#334""" + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1.zip") + f2 = os.path.join(d1, "f2.zip") + with zipfile.ZipFile(f1, mode="w") as z: + # filename, content + z.writestr("foo.txt", "foo.txt") + z.writestr("bar.txt", "bar.txt") + + with zipfile.ZipFile(f2, mode="w") as z: + with open(f1, "rb") as f: + z.writestr("f1.zip", f.read()) + + # We expected this to be the correct syntax + of = fsspec.open_files(f"zip://*.txt::zip://f1.zip::file://{f2}") + + assert len(of) == 2 + for open_file in of: + with open_file as f: + assert f.read().decode("utf-8") == f.name + + +def test_chained_equivalent(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + with open(f1, "wb") as f: + f.write(b"test1") + + of = fsspec.open( + f"simplecache::file://{f1}", + simplecache={"cache_storage": d2, "same_names": True}, + ) + of2 = fsspec.open( + f"simplecache://{f1}", + cache_storage=d2, + same_names=True, + target_protocol="file", + target_options={}, + ) + # the following line passes by fluke - they are not quite the same instance, + # since the parameters don't quite match. Also, the url understood by the two + # of s are not the same (path gets munged a bit differently) + assert of.fs == of2.fs + assert hash(of.fs) == hash(of2.fs) + assert of.open().read() == of2.open().read() + + +def test_chained_fs_multi(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + f2 = os.path.join(d1, "f2") + with open(f1, "wb") as f: + f.write(b"test1") + with open(f2, "wb") as f: + f.write(b"test2") + + of = fsspec.open_files( + f"simplecache::file://{d1}/*", + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test1" + with of[1] as f: + assert f.read() == b"test2" + + assert sorted(os.listdir(d2)) == ["f1", "f2"] + + d2 = tempfile.mkdtemp() + + of = fsspec.open_files( + [f"simplecache::file://{f1}", f"simplecache::file://{f2}"], + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test1" + with of[1] as f: + assert f.read() == b"test2" + + assert sorted(os.listdir(d2)) == ["f1", "f2"] + + +def test_chained_fo(): + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "temp.zip") + d3 = tempfile.mkdtemp() + with zipfile.ZipFile(f1, mode="w") as z: + z.writestr("afile", b"test") + + of = fsspec.open(f"zip://afile::file://{f1}") + with of as f: + assert f.read() == b"test" + + of = fsspec.open_files(f"zip://*::file://{f1}") + with of[0] as f: + assert f.read() == b"test" + + of = fsspec.open_files( + f"simplecache::zip://*::file://{f1}", + simplecache={"cache_storage": d3, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test" + assert "afile" in os.listdir(d3) + + +def test_url_to_fs(): + url = "memory://a.txt" + fs, url2 = fsspec.core.url_to_fs(url) + + assert isinstance(fs, MemoryFileSystem) + assert url2 == "/a.txt" + + +def test_walk(m): + # depth = 0 + dir1 = "/dir1" + # depth = 1 (2 dirs, 1 file) + dir11 = dir1 + "/dir11" + dir12 = dir1 + "/dir12" + file11 = dir1 + "/file11" + # depth = 2 + dir111 = dir11 + "/dir111" + file111 = dir11 + "/file111" + file121 = dir12 + "/file121" + # depth = 3 + file1111 = dir111 + "/file1111" + + m.mkdir(dir111) # Creates parents too + m.mkdir(dir12) # Creates parents too + m.touch(file11) + m.touch(file111) + m.touch(file121) + m.touch(file1111) + + # No maxdepth + assert list(m.walk(dir1, topdown=True)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + (dir11, ["dir111"], ["file111"]), + (dir111, [], ["file1111"]), + (dir12, [], ["file121"]), + ] + assert list(m.walk(dir1, topdown=False)) == [ + (dir111, [], ["file1111"]), + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + (dir1, ["dir11", "dir12"], ["file11"]), + ] + + # maxdepth=2 + assert list(m.walk(dir1, maxdepth=2, topdown=True)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + ] + assert list(m.walk(dir1, maxdepth=2, topdown=False)) == [ + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + (dir1, ["dir11", "dir12"], ["file11"]), + ] + + # maxdepth=1 + assert list(m.walk(dir1, maxdepth=1, topdown=True)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + ] + assert list(m.walk(dir1, maxdepth=1, topdown=False)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + ] + + # maxdepth=0 + with pytest.raises(ValueError): + list(m.walk(dir1, maxdepth=0, topdown=True)) + with pytest.raises(ValueError): + list(m.walk(dir1, maxdepth=0, topdown=False)) + + # prune dir111 + def _walk(*args, **kwargs): + for path, dirs, files in m.walk(*args, **kwargs): + yield (path, dirs.copy(), files) + if "dir111" in dirs: + dirs.remove("dir111") + + assert list(_walk(dir1, topdown=True)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + ] + assert list(_walk(dir1, topdown=False)) == [ + (dir111, [], ["file1111"]), + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + (dir1, ["dir11", "dir12"], ["file11"]), + ] + + # reverse dirs order + def _walk(*args, **kwargs): + for path, dirs, files in m.walk(*args, **kwargs): + yield (path, dirs.copy(), files) + dirs.reverse() + + assert list(_walk(dir1, topdown=True)) == [ + (dir1, ["dir11", "dir12"], ["file11"]), + # Here dir12 comes before dir11 + (dir12, [], ["file121"]), + (dir11, ["dir111"], ["file111"]), + (dir111, [], ["file1111"]), + ] + assert list(_walk(dir1, topdown=False)) == [ + (dir111, [], ["file1111"]), + (dir11, ["dir111"], ["file111"]), + (dir12, [], ["file121"]), + (dir1, ["dir11", "dir12"], ["file11"]), + ] + + # on_error omit by default + assert list(m.walk("do_not_exist")) == [] + # on_error omit + assert list(m.walk("do_not_exist", on_error="omit")) == [] + # on_error raise + with pytest.raises(FileNotFoundError): + list(m.walk("do_not_exist", on_error="raise")) + # on_error callable function + mock = Mock() + assert list(m.walk("do_not_exist", on_error=mock.onerror)) == [] + mock.onerror.assert_called() + assert mock.onerror.call_args.kwargs == {} + assert len(mock.onerror.call_args.args) == 1 + assert isinstance(mock.onerror.call_args.args[0], FileNotFoundError) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a29420fa0de7cd571f25556500645ad62be59d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py @@ -0,0 +1,230 @@ +import asyncio +import inspect +import io +import os +import time + +import pytest + +import fsspec +import fsspec.asyn +from fsspec.asyn import _run_coros_in_chunks + + +def test_sync_methods(): + inst = fsspec.asyn.AsyncFileSystem() + assert inspect.iscoroutinefunction(inst._info) + assert hasattr(inst, "info") + assert inst.info.__qualname__ == "AsyncFileSystem._info" + assert not inspect.iscoroutinefunction(inst.info) + + +def test_when_sync_methods_are_disabled(): + class TestFS(fsspec.asyn.AsyncFileSystem): + mirror_sync_methods = False + + inst = TestFS() + assert inspect.iscoroutinefunction(inst._info) + assert not inspect.iscoroutinefunction(inst.info) + assert inst.info.__qualname__ == "AbstractFileSystem.info" + + +def test_interrupt(): + loop = fsspec.asyn.get_loop() + + async def f(): + await asyncio.sleep(1000000) + return True + + fut = asyncio.run_coroutine_threadsafe(f(), loop) + time.sleep(0.01) # task launches + out = fsspec.asyn._dump_running_tasks(with_task=True) + task = out[0]["task"] + assert task.done() and fut.done() + assert isinstance(fut.exception(), fsspec.asyn.FSSpecCoroutineCancel) + + +class _DummyAsyncKlass: + def __init__(self): + self.loop = fsspec.asyn.get_loop() + + async def _dummy_async_func(self): + # Sleep 1 second function to test timeout + await asyncio.sleep(1) + return True + + async def _bad_multiple_sync(self): + fsspec.asyn.sync_wrapper(_DummyAsyncKlass._dummy_async_func)(self) + return True + + dummy_func = fsspec.asyn.sync_wrapper(_dummy_async_func) + bad_multiple_sync_func = fsspec.asyn.sync_wrapper(_bad_multiple_sync) + + +def test_sync_wrapper_timeout_on_less_than_expected_wait_time_not_finish_function(): + test_obj = _DummyAsyncKlass() + with pytest.raises(fsspec.FSTimeoutError): + test_obj.dummy_func(timeout=0.1) + + +def test_sync_wrapper_timeout_on_more_than_expected_wait_time_will_finish_function(): + test_obj = _DummyAsyncKlass() + assert test_obj.dummy_func(timeout=5) + + +def test_sync_wrapper_timeout_none_will_wait_func_finished(): + test_obj = _DummyAsyncKlass() + assert test_obj.dummy_func(timeout=None) + + +def test_sync_wrapper_treat_timeout_0_as_none(): + test_obj = _DummyAsyncKlass() + assert test_obj.dummy_func(timeout=0) + + +def test_sync_wrapper_bad_multiple_sync(): + test_obj = _DummyAsyncKlass() + with pytest.raises(NotImplementedError): + test_obj.bad_multiple_sync_func(timeout=5) + + +def test_run_coros_in_chunks(monkeypatch): + total_running = 0 + + async def runner(): + nonlocal total_running + + total_running += 1 + await asyncio.sleep(0) + if total_running > 4: + raise ValueError("More than 4 coroutines are running together") + total_running -= 1 + return 1 + + async def main(**kwargs): + nonlocal total_running + + total_running = 0 + coros = [runner() for _ in range(32)] + results = await _run_coros_in_chunks(coros, **kwargs) + for result in results: + if isinstance(result, Exception): + raise result + return results + + assert sum(asyncio.run(main(batch_size=4))) == 32 + + with pytest.raises(ValueError): + asyncio.run(main(batch_size=5)) + + with pytest.raises(ValueError): + asyncio.run(main(batch_size=-1)) + + assert sum(asyncio.run(main(batch_size=4))) == 32 + + monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 5) + with pytest.raises(ValueError): + asyncio.run(main()) + assert sum(asyncio.run(main(batch_size=4))) == 32 # override + + monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 4) + assert sum(asyncio.run(main())) == 32 # override + + +@pytest.mark.skipif(os.name != "nt", reason="only for windows") +def test_windows_policy(): + from asyncio.windows_events import SelectorEventLoop + + loop = fsspec.asyn.get_loop() + policy = asyncio.get_event_loop_policy() + + # Ensure that the created loop always uses selector policy + assert isinstance(loop, SelectorEventLoop) + + # Ensure that the global policy is not changed and it is + # set to the default one. This is important since the + # get_loop() method will temporarily override the policy + # with the one which uses selectors on windows, so this + # check ensures that we are restoring the old policy back + # after our change. + assert isinstance(policy, asyncio.DefaultEventLoopPolicy) + + +def test_running_async(): + assert not fsspec.asyn.running_async() + + async def go(): + assert fsspec.asyn.running_async() + + asyncio.run(go()) + + +class DummyAsyncFS(fsspec.asyn.AsyncFileSystem): + _file_class = fsspec.asyn.AbstractAsyncStreamedFile + + async def _info(self, path, **kwargs): + return {"name": "misc/foo.txt", "type": "file", "size": 100} + + async def open_async( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + return DummyAsyncStreamedFile( + self, + path, + mode, + block_size, + autocommit, + cache_options=cache_options, + **kwargs, + ) + + +class DummyAsyncStreamedFile(fsspec.asyn.AbstractAsyncStreamedFile): + def __init__(self, fs, path, mode, block_size, autocommit, **kwargs): + super().__init__(fs, path, mode, block_size, autocommit, **kwargs) + self.temp_buffer = io.BytesIO(b"foo-bar" * 20) + + async def _fetch_range(self, start, end): + return self.temp_buffer.read(end - start) + + async def _initiate_upload(self): + # Reinitialize for new uploads. + self.temp_buffer = io.BytesIO() + + async def _upload_chunk(self, final=False): + self.temp_buffer.write(self.buffer.getbuffer()) + + async def get_data(self): + return self.temp_buffer.getbuffer().tobytes() + + async def get_data(self): + return self.temp_buffer.getbuffer().tobytes() + + +@pytest.mark.asyncio +async def test_async_streamed_file_write(): + test_fs = DummyAsyncFS() + streamed_file = await test_fs.open_async("misc/foo.txt", mode="wb") + inp_data = "foo-bar".encode("utf8") * streamed_file.blocksize * 2 + await streamed_file.write(inp_data) + assert streamed_file.loc == len(inp_data) + await streamed_file.close() + out_data = await streamed_file.get_data() + assert out_data.count(b"foo-bar") == streamed_file.blocksize * 2 + + +@pytest.mark.asyncio +async def test_async_streamed_file_read(): + test_fs = DummyAsyncFS() + streamed_file = await test_fs.open_async("misc/foo.txt", mode="rb") + assert ( + await streamed_file.read(7 * 3) + await streamed_file.read(7 * 18) + == b"foo-bar" * 20 + ) + await streamed_file.close() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py new file mode 100644 index 0000000000000000000000000000000000000000..5bde713b8e24777378ff913879e0e0ee505e502e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py @@ -0,0 +1,255 @@ +import pickle +import string + +import pytest + +from fsspec.caching import ( + BlockCache, + FirstChunkCache, + ReadAheadCache, + caches, + register_cache, +) +from fsspec.implementations.cached import WholeFileCacheFileSystem + + +def test_cache_getitem(Cache_imp): + cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) + assert cacher._fetch(0, 4) == b"abcd" + assert cacher._fetch(None, 4) == b"abcd" + assert cacher._fetch(2, 4) == b"cd" + + +def test_block_cache_lru(): + # BlockCache is a cache that stores blocks of data and uses LRU to evict + block_size = 4 + cache = BlockCache( + block_size, letters_fetcher, len(string.ascii_letters), maxblocks=2 + ) + # miss + cache._fetch(0, 2) + assert cache.cache_info().misses == 1 + assert cache.cache_info().currsize == 1 + assert cache.total_requested_bytes == block_size * cache.miss_count + assert cache.size == 52 + + # hit + cache._fetch(0, 2) + assert cache.cache_info().misses == 1 + assert cache.cache_info().currsize == 1 + assert cache.total_requested_bytes == block_size * cache.miss_count + + # hit + cache._fetch(0, 2) + assert cache.cache_info().misses == 1 + assert cache.cache_info().currsize == 1 + # this works as a counter since all the reads are from the cache + assert cache.hit_count == 3 + assert cache.miss_count == 1 + # so far only 4 bytes have been read using range requests + assert cache.total_requested_bytes == block_size * cache.miss_count + + # miss + cache._fetch(4, 6) + assert cache.cache_info().misses == 2 + assert cache.cache_info().currsize == 2 + assert cache.total_requested_bytes == block_size * cache.miss_count + + # miss & evict + cache._fetch(12, 13) + assert cache.cache_info().misses == 3 + assert cache.cache_info().currsize == 2 + assert cache.hit_count == 5 + assert cache.miss_count == 3 + assert cache.total_requested_bytes == block_size * cache.miss_count + + +def test_first_cache(): + """ + FirstChunkCache is a cache that only caches the first chunk of data + when some of that first block is requested. + """ + block_size = 5 + cache = FirstChunkCache(block_size, letters_fetcher, len(string.ascii_letters)) + assert cache.cache is None + assert cache._fetch(12, 15) == letters_fetcher(12, 15) + assert cache.miss_count == 1 + assert cache.hit_count == 0 + assert cache.cache is None + total_requested_bytes = 15 - 12 + assert cache.total_requested_bytes == total_requested_bytes + + # because we overlap with the cache range, it will be cached + assert cache._fetch(3, 10) == letters_fetcher(3, 10) + assert cache.miss_count == 2 + assert cache.hit_count == 0 + # we'll read the first 5 and then the rest + total_requested_bytes += block_size + 5 + assert cache.total_requested_bytes == total_requested_bytes + + # partial hit again + assert cache._fetch(3, 10) == letters_fetcher(3, 10) + assert cache.miss_count == 2 + assert cache.hit_count == 1 + # we have the first 5 bytes cached + total_requested_bytes += 10 - 5 + assert cache.total_requested_bytes == total_requested_bytes + + assert cache.cache == letters_fetcher(0, 5) + assert cache._fetch(0, 4) == letters_fetcher(0, 4) + assert cache.hit_count == 2 + assert cache.miss_count == 2 + assert cache.total_requested_bytes == 18 + + +def test_readahead_cache(): + """ + ReadAheadCache is a cache that reads ahead of the requested range. + If the access pattern is not sequential it will be very inefficient. + """ + block_size = 5 + cache = ReadAheadCache(block_size, letters_fetcher, len(string.ascii_letters)) + assert cache._fetch(12, 15) == letters_fetcher(12, 15) + assert cache.miss_count == 1 + assert cache.hit_count == 0 + total_requested_bytes = 15 - 12 + block_size + assert cache.total_requested_bytes == total_requested_bytes + + assert cache._fetch(3, 10) == letters_fetcher(3, 10) + assert cache.miss_count == 2 + assert cache.hit_count == 0 + assert len(cache.cache) == 12 + total_requested_bytes += (10 - 3) + block_size + assert cache.total_requested_bytes == total_requested_bytes + + # caache hit again + assert cache._fetch(3, 10) == letters_fetcher(3, 10) + assert cache.miss_count == 2 + assert cache.hit_count == 1 + assert len(cache.cache) == 12 + assert cache.total_requested_bytes == total_requested_bytes + assert cache.cache == letters_fetcher(3, 15) + + # cache miss + assert cache._fetch(0, 4) == letters_fetcher(0, 4) + assert cache.hit_count == 1 + assert cache.miss_count == 3 + assert len(cache.cache) == 9 + total_requested_bytes += (4 - 0) + block_size + assert cache.total_requested_bytes == total_requested_bytes + + +def _fetcher(start, end): + return b"0" * (end - start) + + +def letters_fetcher(start, end): + return string.ascii_letters[start:end].encode() + + +not_parts_caches = {k: v for k, v in caches.items() if k != "parts"} + + +@pytest.fixture(params=not_parts_caches.values(), ids=list(not_parts_caches)) +def Cache_imp(request): + return request.param + + +def test_cache_empty_file(Cache_imp): + blocksize = 5 + size = 0 + cache = Cache_imp(blocksize, _fetcher, size) + assert cache._fetch(0, 0) == b"" + + +def test_cache_pickleable(Cache_imp): + blocksize = 5 + size = 100 + cache = Cache_imp(blocksize, _fetcher, size) + cache._fetch(0, 5) # fill in cache + unpickled = pickle.loads(pickle.dumps(cache)) + assert isinstance(unpickled, Cache_imp) + assert unpickled.blocksize == blocksize + assert unpickled.size == size + assert unpickled._fetch(0, 10) == b"0" * 10 + + +@pytest.mark.parametrize( + "size_requests", + [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], +) +@pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) +def test_cache_basic(Cache_imp, blocksize, size_requests): + cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) + + for start, end in size_requests: + result = cache._fetch(start, end) + expected = string.ascii_letters[start:end].encode() + assert result == expected + + +@pytest.mark.parametrize("strict", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_known(sort, strict): + parts = {(10, 20): b"1" * 10, (20, 30): b"2" * 10, (0, 10): b"0" * 10} + if sort: + parts = dict(sorted(parts.items())) + c = caches["parts"](None, None, 100, parts, strict=strict) + assert (0, 30) in c.data # got consolidated + assert c._fetch(5, 15) == b"0" * 5 + b"1" * 5 + assert c._fetch(15, 25) == b"1" * 5 + b"2" * 5 + if strict: + # Over-read will raise error + with pytest.raises(ValueError): + # tries to call None fetcher + c._fetch(25, 35) + else: + # Over-read will be zero-padded + assert c._fetch(25, 35) == b"2" * 5 + b"\x00" * 5 + + +def test_background(server, monkeypatch): + import threading + import time + + import fsspec + + head = {"head_ok": "true", "head_give_length": "true"} + urla = server + "/index/realfile" + h = fsspec.filesystem("http", headers=head) + thread_ids = {threading.current_thread().ident} + f = h.open(urla, block_size=5, cache_type="background") + orig = f.cache._fetch_block + + def wrapped(*a, **kw): + thread_ids.add(threading.current_thread().ident) + return orig(*a, **kw) + + f.cache._fetch_block = wrapped + assert len(thread_ids) == 1 + f.read(1) + time.sleep(0.1) # second block is loading + assert len(thread_ids) == 2 + + +def test_register_cache(): + # just test that we have them populated and fail to re-add again unless overload + with pytest.raises(ValueError): + register_cache(BlockCache) + register_cache(BlockCache, clobber=True) + + +def test_cache_kwargs(mocker): + # test that kwargs are passed to the underlying filesystem after cache commit + + fs = WholeFileCacheFileSystem(target_protocol="memory") + fs.touch("test") + fs.fs.put = mocker.MagicMock() + + with fs.open("test", "wb", overwrite=True) as file_handle: + file_handle.write(b"foo") + + # We don't care about the first parameter, just retrieve its expected value. + # It is a random location that cannot be predicted. + # The important thing is the 'overwrite' kwarg + fs.fs.put.assert_called_with(fs.fs.put.call_args[0][0], "/test", overwrite=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..2cc679d0299ed4ecf4407be3e0a0f85cfa766feb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py @@ -0,0 +1,89 @@ +import pytest + +from fsspec.callbacks import Callback, TqdmCallback + + +def test_callbacks(): + empty_callback = Callback() + assert empty_callback.call("something", somearg=None) is None + + hooks = {"something": lambda *_, arg=None: arg + 2} + simple_callback = Callback(hooks=hooks) + assert simple_callback.call("something", arg=2) == 4 + + hooks = {"something": lambda *_, arg1=None, arg2=None: arg1 + arg2} + multi_arg_callback = Callback(hooks=hooks) + assert multi_arg_callback.call("something", arg1=2, arg2=2) == 4 + + +def test_callbacks_as_callback(): + empty_callback = Callback.as_callback(None) + assert empty_callback.call("something", arg="somearg") is None + assert Callback.as_callback(None) is Callback.as_callback(None) + + hooks = {"something": lambda *_, arg=None: arg + 2} + real_callback = Callback.as_callback(Callback(hooks=hooks)) + assert real_callback.call("something", arg=2) == 4 + + +def test_callbacks_as_context_manager(mocker): + spy_close = mocker.spy(Callback, "close") + + with Callback() as cb: + assert isinstance(cb, Callback) + + spy_close.assert_called_once() + + +def test_callbacks_branched(): + callback = Callback() + + branch = callback.branched("path_1", "path_2") + + assert branch is not callback + assert isinstance(branch, Callback) + + +@pytest.mark.asyncio +async def test_callbacks_branch_coro(mocker): + async_fn = mocker.AsyncMock(return_value=10) + callback = Callback() + wrapped_fn = callback.branch_coro(async_fn) + spy = mocker.spy(callback, "branched") + + assert await wrapped_fn("path_1", "path_2", key="value") == 10 + + spy.assert_called_once_with("path_1", "path_2", key="value") + async_fn.assert_called_once_with( + "path_1", "path_2", callback=spy.spy_return, key="value" + ) + + +def test_callbacks_wrap(): + events = [] + + class TestCallback(Callback): + def relative_update(self, inc=1): + events.append(inc) + + callback = TestCallback() + for _ in callback.wrap(range(10)): + ... + + assert events == [1] * 10 + + +@pytest.mark.parametrize("tqdm_kwargs", [{}, {"desc": "A custom desc"}]) +def test_tqdm_callback(tqdm_kwargs, mocker): + pytest.importorskip("tqdm") + callback = TqdmCallback(tqdm_kwargs=tqdm_kwargs) + mocker.patch.object(callback, "_tqdm_cls") + callback.set_size(10) + for _ in callback.wrap(range(10)): + ... + + assert callback.tqdm.update.call_count == 11 + if not tqdm_kwargs: + callback._tqdm_cls.assert_called_with(total=10) + else: + callback._tqdm_cls.assert_called_with(total=10, **tqdm_kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py new file mode 100644 index 0000000000000000000000000000000000000000..3b670ce128a98201bc7aa36190fe0e52a1a4d486 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py @@ -0,0 +1,164 @@ +import pathlib + +import pytest + +import fsspec.core +from fsspec.compression import compr, register_compression +from fsspec.utils import compressions, infer_compression + + +def test_infer_custom_compression(): + """Inferred compression gets values from fsspec.compression.compr.""" + assert infer_compression("fn.zip") == "zip" + assert infer_compression("fn.gz") == "gzip" + assert infer_compression("fn.unknown") is None + assert infer_compression("fn.test_custom") is None + assert infer_compression("fn.tst") is None + + register_compression("test_custom", lambda f, **kwargs: f, "tst") + + try: + assert infer_compression("fn.zip") == "zip" + assert infer_compression("fn.gz") == "gzip" + assert infer_compression("fn.unknown") is None + assert infer_compression("fn.test_custom") is None + assert infer_compression("fn.tst") == "test_custom" + + # Duplicate registration in name or extension raises a value error. + with pytest.raises(ValueError): + register_compression("test_custom", lambda f, **kwargs: f, "tst") + + with pytest.raises(ValueError): + register_compression("test_conflicting", lambda f, **kwargs: f, "tst") + assert "test_conflicting" not in compr + + # ...but can be forced. + register_compression( + "test_conflicting", lambda f, **kwargs: f, "tst", force=True + ) + assert infer_compression("fn.zip") == "zip" + assert infer_compression("fn.gz") == "gzip" + assert infer_compression("fn.unknown") is None + assert infer_compression("fn.test_custom") is None + assert infer_compression("fn.tst") == "test_conflicting" + + finally: + del compr["test_custom"] + del compr["test_conflicting"] + del compressions["tst"] + + +def test_infer_uppercase_compression(): + assert infer_compression("fn.ZIP") == "zip" + assert infer_compression("fn.GZ") == "gzip" + assert infer_compression("fn.UNKNOWN") is None + assert infer_compression("fn.TEST_UPPERCASE") is None + assert infer_compression("fn.TEST") is None + + +def test_lzma_compression_name(): + pytest.importorskip("lzma") + assert infer_compression("fn.xz") == "xz" + assert infer_compression("fn.lzma") == "lzma" + + +def test_lz4_compression(tmpdir): + """Infer lz4 compression for .lz4 files if lz4 is available.""" + tmp_path = pathlib.Path(str(tmpdir)) + + lz4 = pytest.importorskip("lz4") + + tmp_path.mkdir(exist_ok=True) + + tdat = "foobar" * 100 + + with fsspec.core.open( + str(tmp_path / "out.lz4"), mode="wt", compression="infer" + ) as outfile: + outfile.write(tdat) + + compressed = (tmp_path / "out.lz4").open("rb").read() + assert lz4.frame.decompress(compressed).decode() == tdat + + with fsspec.core.open( + str(tmp_path / "out.lz4"), mode="rt", compression="infer" + ) as infile: + assert infile.read() == tdat + + with fsspec.core.open( + str(tmp_path / "out.lz4"), mode="rt", compression="lz4" + ) as infile: + assert infile.read() == tdat + + +def test_zstd_compression(tmpdir): + """Infer zstd compression for .zst files if zstandard is available.""" + tmp_path = pathlib.Path(str(tmpdir)) + + zstd = pytest.importorskip("zstandard") + + tmp_path.mkdir(exist_ok=True) + + tdat = "foobar" * 100 + + with fsspec.core.open( + str(tmp_path / "out.zst"), mode="wt", compression="infer" + ) as outfile: + outfile.write(tdat) + + compressed = (tmp_path / "out.zst").open("rb").read() + assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat + + with fsspec.core.open( + str(tmp_path / "out.zst"), mode="rt", compression="infer" + ) as infile: + assert infile.read() == tdat + + with fsspec.core.open( + str(tmp_path / "out.zst"), mode="rt", compression="zstd" + ) as infile: + assert infile.read() == tdat + + # fails in https://github.com/fsspec/filesystem_spec/issues/725 + infile = fsspec.core.open( + str(tmp_path / "out.zst"), mode="rb", compression="infer" + ).open() + + infile.close() + + +def test_snappy_compression(tmpdir): + """No registered compression for snappy, but can be specified.""" + tmp_path = pathlib.Path(str(tmpdir)) + + snappy = pytest.importorskip("snappy") + + tmp_path.mkdir(exist_ok=True) + + tdat = "foobar" * 100 + + # Snappy isn't inferred. + with fsspec.core.open( + str(tmp_path / "out.snappy"), mode="wt", compression="infer" + ) as outfile: + outfile.write(tdat) + assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat + + # but can be specified. + with fsspec.core.open( + str(tmp_path / "out.snappy"), mode="wt", compression="snappy" + ) as outfile: + outfile.write(tdat) + + compressed = (tmp_path / "out.snappy").open("rb").read() + assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat + + with fsspec.core.open( + str(tmp_path / "out.snappy"), mode="rb", compression="infer" + ) as infile: + assert infile.read() == compressed + + with fsspec.core.open( + str(tmp_path / "out.snappy"), mode="rt", compression="snappy" + ) as infile: + assert infile.read() == tdat diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..dc251a0d91fbed1bdec6df0b185d1607cdb2b82f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py @@ -0,0 +1,129 @@ +import os +from warnings import catch_warnings + +import pytest + +import fsspec +from fsspec.config import conf, set_conf_env, set_conf_files + + +@pytest.fixture +def clean_conf(): + """Tests should start and end with clean config dict""" + conf.clear() + yield + conf.clear() + + +def test_from_env_ignored(clean_conf): + env = { + "FSSPEC": "missing_protocol", + "FSSPEC_": "missing_protocol", + "FSSPEC__INVALID_KEY": "invalid_protocol", + "FSSPEC_INVALID1": "not_json_dict", + "FSSPEC_INVALID2": '["not_json_dict"]', + } + cd = {} + with catch_warnings(record=True) as w: + set_conf_env(conf_dict=cd, envdict=env) + assert len(w) == 5 + assert "unexpected name" in str(w[0].message) + assert "unexpected name" in str(w[1].message) + assert "unexpected name" in str(w[2].message) + assert "parse failure" in str(w[3].message) + assert "not being a dict" in str(w[4].message) + assert cd == {} + + +def test_from_env_kwargs(clean_conf): + env = { + "FSSPEC_PROTO_KEY": "value", + "FSSPEC_PROTO_LONG_KEY": "othervalue", + "FSSPEC_MALFORMED": "novalue", + } + cd = {} + with catch_warnings(record=True) as w: + set_conf_env(conf_dict=cd, envdict=env) + assert len(w) == 1 + assert "parse failure" in str(w[0].message) + assert cd == {"proto": {"key": "value", "long_key": "othervalue"}} + + +def test_from_env_protocol_dict(clean_conf): + env = { + "FSSPEC_PROTO": '{"int": 1, "float": 2.3, "bool": true, "dict": {"key": "val"}}' + } + cd = {} + set_conf_env(conf_dict=cd, envdict=env) + assert cd == { + "proto": {"int": 1, "float": 2.3, "bool": True, "dict": {"key": "val"}} + } + + +def test_from_env_kwargs_override_protocol_dict(clean_conf): + env = { + "FSSPEC_PROTO_LONG_KEY": "override1", + "FSSPEC_PROTO": '{"key": "value1", "long_key": "value2", "otherkey": "value3"}', + "FSSPEC_PROTO_KEY": "override2", + } + cd = {} + set_conf_env(conf_dict=cd, envdict=env) + assert cd == { + "proto": {"key": "override2", "long_key": "override1", "otherkey": "value3"} + } + + +def test_from_file_ini(clean_conf, tmpdir): + file1 = os.path.join(tmpdir, "1.ini") + file2 = os.path.join(tmpdir, "2.ini") + with open(file1, "w") as f: + f.write( + """[proto] +key=value +other_key:othervalue +overwritten=dont_see + """ + ) + with open(file2, "w") as f: + f.write( + """[proto] +overwritten=see + """ + ) + cd = {} + set_conf_files(tmpdir, cd) + assert cd == { + "proto": {"key": "value", "other_key": "othervalue", "overwritten": "see"} + } + + +def test_from_file_json(clean_conf, tmpdir): + file1 = os.path.join(tmpdir, "1.json") + file2 = os.path.join(tmpdir, "2.json") + with open(file1, "w") as f: + f.write( + """{"proto": +{"key": "value", +"other_key": "othervalue", +"overwritten": false}} + """ + ) + with open(file2, "w") as f: + f.write( + """{"proto": +{"overwritten": true}} + """ + ) + cd = {} + set_conf_files(tmpdir, cd) + assert cd == { + "proto": {"key": "value", "other_key": "othervalue", "overwritten": True} + } + + +def test_apply(clean_conf): + conf["file"] = {"auto_mkdir": "test"} + fs = fsspec.filesystem("file") + assert fs.auto_mkdir == "test" + fs = fsspec.filesystem("file", auto_mkdir=True) + assert fs.auto_mkdir is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py new file mode 100644 index 0000000000000000000000000000000000000000..8626ae8fd9bba7a58cad8845f656977482444df7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py @@ -0,0 +1,466 @@ +import os +import pickle +import tempfile +import zipfile +from contextlib import contextmanager +from pathlib import Path + +import pytest + +import fsspec +from fsspec.core import ( + OpenFile, + OpenFiles, + _expand_paths, + expand_paths_if_needed, + get_compression, + get_fs_token_paths, + open_files, + open_local, +) + + +@contextmanager +def tempzip(data={}): + f = tempfile.mkstemp(suffix="zip")[1] + with zipfile.ZipFile(f, mode="w") as z: + for k, v in data.items(): + z.writestr(k, v) + try: + yield f + finally: + try: + os.remove(f) + except OSError: + pass + + +@pytest.mark.parametrize( + "path, name_function, num, out", + [ + [["apath"], None, 1, ["apath"]], + ["apath.*.csv", None, 1, ["apath.0.csv"]], + ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]], + ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]], + ], +) +def test_expand_paths(path, name_function, num, out): + assert _expand_paths(path, name_function, num) == out + + +@pytest.mark.parametrize( + "create_files, path, out", + [ + [["apath"], "apath", ["apath"]], + [["apath1"], "apath*", ["apath1"]], + [["apath1", "apath2"], "apath*", ["apath1", "apath2"]], + [["apath1", "apath2"], "apath[1]", ["apath1"]], + [["apath1", "apath11"], "apath?", ["apath1"]], + ], +) +def test_expand_paths_if_needed_in_read_mode(create_files, path, out): + d = str(tempfile.mkdtemp()) + for f in create_files: + f = os.path.join(d, f) + open(f, "w").write("test") + + path = os.path.join(d, path) + + fs = fsspec.filesystem("file") + res = expand_paths_if_needed([path], "r", 0, fs, None) + assert [os.path.basename(p) for p in res] == out + + +def test_expand_error(): + with pytest.raises(ValueError): + _expand_paths("*.*", None, 1) + + +@pytest.mark.parametrize("mode", ["w", "w+", "x", "x+"]) +def test_expand_fs_token_paths(mode): + assert len(get_fs_token_paths("path", mode, num=2, expand=True)[-1]) == 2 + + +def test_openfile_api(m): + m.open("somepath", "wb").write(b"data") + of = OpenFile(m, "somepath") + assert str(of) == "" + f = of.open() + assert f.read() == b"data" + f.close() + with OpenFile(m, "somepath", mode="rt") as f: + assert f.read() == "data" + + +def test_openfile_open(m): + of = OpenFile(m, "somepath", mode="wt") + f = of.open() + f.write("hello") + assert m.size("somepath") == 0 # no flush yet + of.close() + assert m.size("somepath") == 5 + + +def test_open_local_w_cache(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + d2 = str(tempfile.mkdtemp()) + fn = open_local(f"simplecache://{f1}", cache_storage=d2, target_protocol="file") + assert isinstance(fn, str) + assert open(fn).read() == "test1" + assert d2 in fn + + +def test_open_local_w_magic(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + fn = open_local(os.path.join(d1, "f*")) + assert len(fn) == 1 + assert isinstance(fn, list) + + +def test_open_local_w_list_of_str(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + fn = open_local([f1, f1]) + assert len(fn) == 2 + assert isinstance(fn, list) + assert all(isinstance(elem, str) for elem in fn) + + +def test_open_local_w_path(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + p = Path(f1) + fn = open_local(p) + assert isinstance(fn, str) + + +def test_open_local_w_list_of_path(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + p = Path(f1) + fn = open_local([p, p]) + assert len(fn) == 2 + assert isinstance(fn, list) + assert all(isinstance(elem, str) for elem in fn) + + +def test_xz_lzma_compressions(): + pytest.importorskip("lzma") + # Ensure that both 'xz' and 'lzma' compression names can be parsed + assert get_compression("some_file.xz", "infer") == "xz" + assert get_compression("some_file.xz", "xz") == "xz" + assert get_compression("some_file.xz", "lzma") == "lzma" + + +def test_list(): + here = os.path.abspath(os.path.dirname(__file__)) + flist = os.listdir(here) + plist = [os.path.join(here, p).replace("\\", "/") for p in flist] + of = open_files(plist) + assert len(of) == len(flist) + assert [f.path for f in of] == plist + + +def test_open_expand(m, monkeypatch): + m.pipe("/myfile", b"hello") + with pytest.raises(FileNotFoundError, match="expand=True"): + with fsspec.open("memory://my*", expand=False): + pass + with fsspec.open("memory://my*", expand=True) as f: + assert f.path == "/myfile" + monkeypatch.setattr(fsspec.core, "DEFAULT_EXPAND", True) + with fsspec.open("memory://my*") as f: + assert f.path == "/myfile" + + +def test_pathobject(tmpdir): + import pathlib + + tmpdir = str(tmpdir) + plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]] + open(plist_str[0], "w").write("first file") + open(plist_str[1], "w").write("second file") + plist = [pathlib.Path(p) for p in plist_str] + of = open_files(plist) + assert len(of) == 2 + assert [f.path for f in of] == plist_str + + of = open_files(plist[0]) + assert len(of) == 1 + assert of[0].path == plist_str[0] + with of[0] as f: + assert f.read() == open(plist_str[0], "rb").read() + + +def test_automkdir(tmpdir): + dir = os.path.join(str(tmpdir), "a") + of = fsspec.open(os.path.join(dir, "afile"), "w", auto_mkdir=False) + with pytest.raises(IOError): + with of: + pass + + dir = os.path.join(str(tmpdir), "b") + of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=True) + with of: + pass + + assert "bfile" in os.listdir(dir) + + dir = os.path.join(str(tmpdir), "c") + with pytest.raises(FileNotFoundError): + of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=False) + with of: + pass + + +def test_automkdir_readonly(tmpdir): + dir = os.path.join(str(tmpdir), "d") + with pytest.raises(FileNotFoundError): + of = fsspec.open(os.path.join(dir, "dfile"), "r") + with of: + pass + + +def test_openfile_pickle_newline(): + # GH#318 + test = fsspec.open(__file__, newline=b"") + + pickled = pickle.dumps(test) + restored = pickle.loads(pickled) + + assert test.newline == restored.newline + + +def test_pickle_after_open_open(): + of = fsspec.open(__file__, mode="rt") + test = of.open() + of2 = pickle.loads(pickle.dumps(of)) + test2 = of2.open() + test.close() + + assert not test2.closed + of.close() + of2.close() + + +# Define a list of special glob characters. +# Note that we need to escape some characters and also consider file system limitations. +# '*' and '?' are excluded because they are not valid for many file systems. +# Similarly, we're careful with '{', '}', and '@' as their special meaning is +# context-specific and might not be considered special for filenames. +# Add tests for more file systems and for more glob magic later +glob_magic_characters = ["[", "]", "!"] +if os.name != "nt": + glob_magic_characters.extend(("*", "?")) # not valid on Windows + + +@pytest.mark.parametrize("char", glob_magic_characters) +def test_open_file_read_with_special_characters(tmp_path, char): + # Create a filename incorporating the special character + file_name = f"test{char}.txt" + file_path = tmp_path / file_name + expected_content = "Hello, world!" + + with open(file_path, "w") as f: + f.write(expected_content) + + with fsspec.open(file_path, "r") as f: + actual_content = f.read() + + assert actual_content == expected_content + + +@pytest.mark.parametrize("char", glob_magic_characters) +def test_open_files_read_with_special_characters(tmp_path, char): + # Create a filename incorporating the special character + file_name = f"test{char}.txt" + file_path = tmp_path / file_name + expected_content = "Hello, world!" + + with open(file_path, "w") as f: + f.write(expected_content) + + with fsspec.open_files(file_path, "r")[0] as f: + actual_content = f.read() + + assert actual_content == expected_content + + +@pytest.mark.parametrize("char", glob_magic_characters) +def test_open_file_write_with_special_characters(tmp_path, char, monkeypatch): + # Create a filename incorporating the special character + file_name = f"test{char}.txt" + file_path = tmp_path / file_name + expected_content = "Hello, world!" + + with fsspec.open(file_path, "w", expand=False) as f: + f.write(expected_content) + + with open(file_path, "r") as f: + actual_content = f.read() + + monkeypatch.setattr(fsspec.core, "DEFAULT_EXPAND", False) + with fsspec.open(file_path, "w") as f: + f.write(expected_content * 2) + + with open(file_path, "r") as f: + f.read() == actual_content * 2 + + assert actual_content == expected_content + + +@pytest.mark.parametrize("char", glob_magic_characters) +def test_open_files_read_with_special_characters(tmp_path, char): + # Create a filename incorporating the special character + file_name = f"test{char}.txt" + file_path = tmp_path / file_name + expected_content = "Hello, world!" + + with open(file_path, "w") as f: + f.write(expected_content) + + with fsspec.open_files( + urlpath=[os.fspath(file_path)], mode="r", auto_mkdir=False, expand=False + )[0] as f: + actual_content = f.read() + + assert actual_content == expected_content + + +@pytest.mark.parametrize("char", glob_magic_characters) +def test_open_files_write_with_special_characters(tmp_path, char): + # Create a filename incorporating the special character + file_name = f"test{char}.txt" + file_path = tmp_path / file_name + expected_content = "Hello, world!" + + with fsspec.open_files( + urlpath=[os.fspath(file_path)], mode="w", auto_mkdir=False, expand=False + )[0] as f: + f.write(expected_content) + + with open(file_path, "r") as f: + actual_content = f.read() + + assert actual_content == expected_content + + +def test_mismatch(): + pytest.importorskip("s3fs") + with pytest.raises(ValueError): + open_files(["s3://test/path.csv", "/other/path.csv"]) + + +def test_url_kwargs_chain(ftp_writable): + host, port, username, password = ftp_writable + data = b"hello" + with fsspec.open( + "ftp:///afile", "wb", host=host, port=port, username=username, password=password + ) as f: + f.write(data) + + with fsspec.open( + f"simplecache::ftp://{username}:{password}@{host}:{port}//afile", "rb" + ) as f: + assert f.read() == data + + +def test_multi_context(tmpdir): + fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]] + files = open_files(fns, "wb") + assert isinstance(files, OpenFiles) + assert isinstance(files[0], OpenFile) + assert len(files) == 2 + assert isinstance(files[:1], OpenFiles) + assert len(files[:1]) == 1 + with files as of: + assert len(of) == 2 + assert not of[0].closed + assert of[0].name.endswith("a") + assert of[0].closed + assert repr(files) == "" + + +def test_not_local(): + with pytest.raises(ValueError, match="attribute local_file=True"): + open_local("memory://afile") + + +def test_url_to_fs(ftp_writable): + host, port, username, password = ftp_writable + data = b"hello" + with fsspec.open(f"ftp://{username}:{password}@{host}:{port}/afile", "wb") as f: + f.write(data) + fs, url = fsspec.core.url_to_fs( + f"simplecache::ftp://{username}:{password}@{host}:{port}/afile" + ) + assert url == "/afile" + fs, url = fsspec.core.url_to_fs(f"ftp://{username}:{password}@{host}:{port}/afile") + assert url == "/afile" + + with fsspec.open(f"ftp://{username}:{password}@{host}:{port}/afile.zip", "wb") as f: + import zipfile + + with zipfile.ZipFile(f, "w") as z: + with z.open("inner", "w") as f2: + f2.write(b"hello") + f.write(data) + + fs, url = fsspec.core.url_to_fs( + f"zip://inner::ftp://{username}:{password}@{host}:{port}/afile.zip" + ) + assert url == "inner" + fs, url = fsspec.core.url_to_fs( + f"simplecache::zip::ftp://{username}:{password}@{host}:{port}/afile.zip" + ) + assert url == "" + + +def test_target_protocol_options(ftp_writable): + host, port, username, password = ftp_writable + data = {"afile": b"hello"} + options = {"host": host, "port": port, "username": username, "password": password} + with tempzip(data) as lfile, fsspec.open( + "ftp:///archive.zip", "wb", **options + ) as f: + f.write(open(lfile, "rb").read()) + with fsspec.open( + "zip://afile", + "rb", + target_protocol="ftp", + target_options=options, + fo="archive.zip", + ) as f: + assert f.read() == data["afile"] + + +def test_chained_url(ftp_writable): + host, port, username, password = ftp_writable + data = {"afile": b"hello"} + cls = fsspec.get_filesystem_class("ftp") + fs = cls(host=host, port=port, username=username, password=password) + with tempzip(data) as lfile: + fs.put_file(lfile, "archive.zip") + + urls = [ + "zip://afile", + "zip://afile::simplecache", + "simplecache::zip://afile", + "simplecache::zip://afile::simplecache", + ] + for url in urls: + url += f"::ftp://{username}:{password}@{host}:{port}/archive.zip" + with fsspec.open(url, "rb") as f: + assert f.read() == data["afile"] + + +def test_automkdir_local(): + fs, _ = fsspec.core.url_to_fs("file://", auto_mkdir=True) + assert fs.auto_mkdir is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py new file mode 100644 index 0000000000000000000000000000000000000000..172b2a7a73e8a4eef07776bfe25cc410c4495577 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py @@ -0,0 +1,40 @@ +import pytest + +pytest.importorskip("s3fs") +pytest.importorskip("moto") + +try: + from s3fs.tests.test_s3fs import ( # noqa: E402,F401 + endpoint_uri, + s3, + s3_base, + test_bucket_name, + ) +except ImportError: + pytest.skip("s3 tests not available.") + +so = {"anon": False, "client_kwargs": {"endpoint_url": endpoint_uri}} + + +def test_pandas(s3): + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.to_csv(f"s3://{test_bucket_name}/a.csv", storage_options=so) + df2 = pd.read_csv(f"s3://{test_bucket_name}/a.csv", storage_options=so) + + assert df.a.equals(df2.a) + + +def test_xarray_zarr(s3): + xr = pytest.importorskip("xarray") + pytest.importorskip("zarr") + import numpy as np + + x = np.arange(5) + xarr = xr.DataArray(x) + ds = xr.Dataset({"x": xarr}) + ds.to_zarr(f"s3://{test_bucket_name}/a.zarr", storage_options=so) + + ds2 = xr.open_zarr(f"s3://{test_bucket_name}/a.zarr", storage_options=so) + + assert (ds.x == ds2.x).all() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py new file mode 100644 index 0000000000000000000000000000000000000000..784f50753f2af9fde4f16a7945a10bce8f76f1c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py @@ -0,0 +1,200 @@ +"""Tests abstract buffered file API, using FTP implementation""" + +import pickle + +import pytest + +from fsspec.implementations.tests.test_ftp import FTPFileSystem + +data = b"hello" * 10000 + + +def test_pickle(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + f = ftp.open("/out", "rb") + + f2 = pickle.loads(pickle.dumps(f)) + assert f == f2 + + +def test_file_read_attributes(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + f = ftp.open("/out", "rb") + assert f.info()["size"] == len(data) + assert f.tell() == 0 + assert f.seekable() + assert f.readable() + assert not f.writable() + out = bytearray(len(data)) + + assert f.read() == data + assert f.read() == b"" + f.seek(0) + assert f.readuntil(b"l") == b"hel" + assert f.tell() == 3 + + f.readinto1(out) + assert out[:-3] == data[3:] + with pytest.raises(ValueError): + f.write(b"") + f.close() + with pytest.raises(ValueError): + f.read()(b"") + + +def test_seek(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + f = ftp.open("/out", "rb") + + assert f.seek(-10, 2) == len(data) - 10 + assert f.tell() == len(data) - 10 + assert f.seek(-1, 1) == len(data) - 11 + with pytest.raises(ValueError): + f.seek(-1) + with pytest.raises(ValueError): + f.seek(0, 7) + + +def test_file_idempotent(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + f = ftp.open("/out", "rb") + f2 = ftp.open("/out", "rb") + assert hash(f) == hash(f2) + assert f == f2 + ftp.touch("/out2") + f2 = ftp.open("/out2", "rb") + assert hash(f2) != hash(f) + assert f != f2 + f2 = ftp.open("/out", "wb") + assert hash(f2) != hash(f) + + +def test_file_text_attributes(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + data = b"hello\n" * 1000 + with ftp.open("/out2", "wb") as f: + f.write(data) + + f = ftp.open("/out2", "rb") + assert f.readline() == b"hello\n" + f.seek(0) + assert list(f) == [d + b"\n" for d in data.split()] + f.seek(0) + assert f.readlines() == [d + b"\n" for d in data.split()] + + f = ftp.open("/out2", "rt") + assert f.readline() == "hello\n" + assert f.encoding + + +def test_file_write_attributes(ftp_writable): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + f = ftp.open("/out2", "wb") + with pytest.raises(ValueError): + f.info() + with pytest.raises(OSError): + f.seek(0) + with pytest.raises(ValueError): + f.read(0) + assert not f.readable() + assert f.writable() + + f.flush() # no-op + + assert f.write(b"hello") == 5 + assert f.write(b"hello") == 5 + assert not f.closed + f.close() + assert f.closed + with pytest.raises(ValueError): + f.write(b"") + with pytest.raises(ValueError): + f.flush() + + +def test_midread_cache(ftp_writable): + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host=host, port=port, username=user, password=pw) + fn = "/myfile" + with fs.open(fn, "wb") as f: + f.write(b"a" * 175627146) + with fs.open(fn, "rb") as f: + f.seek(175561610) + d1 = f.read(65536) + assert len(d1) == 65536 + + f.seek(4) + size = 17562198 + d2 = f.read(size) + assert len(d2) == size + + f.seek(17562288) + size = 17562187 + d3 = f.read(size) + assert len(d3) == size + + +def test_read_block(ftp_writable): + # not the same as test_read_block in test_utils, this depends on the + # behaviour of the bytest caching + from fsspec.utils import read_block + + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host=host, port=port, username=user, password=pw) + fn = "/myfile" + with fs.open(fn, "wb") as f: + f.write(b"a,b\n1,2") + f = fs.open(fn, "rb", cache_type="bytes") + assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2" + + +def test_with_gzip(ftp_writable): + import gzip + + data = b"some compressible stuff" + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host=host, port=port, username=user, password=pw) + fn = "/myfile" + with fs.open(fn, "wb") as f: + gf = gzip.GzipFile(fileobj=f, mode="w") + gf.write(data) + gf.close() + with fs.open(fn, "rb") as f: + gf = gzip.GzipFile(fileobj=f, mode="r") + assert gf.read() == data + + +def test_auto_compression(m): + fs = m + with fs.open("myfile.gz", mode="wt", compression="infer") as f: + f.write("text") + with fs.open("myfile.gz", mode="rt", compression="infer") as f: + assert f.read() == "text" + + +def test_with_zip(ftp_writable): + import zipfile + + data = b"hello zip" + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host=host, port=port, username=user, password=pw) + fn = "/myfile.zip" + inner_file = "test.txt" + with fs.open(fn, "wb") as f: + zf = zipfile.ZipFile(f, mode="w") + zf.writestr(inner_file, data) + zf.close() + with fs.open(fn, "rb") as f: + zf = zipfile.ZipFile(f, mode="r") + assert zf.read(inner_file) == data diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py new file mode 100644 index 0000000000000000000000000000000000000000..db627ffc962fa58f2e7de4e335a3a08cceeed625 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py @@ -0,0 +1,147 @@ +import os +import subprocess +import time +from multiprocessing import Process + +import pytest + +try: + pytest.importorskip("fuse") # noqa: E402 +except OSError: + # can succeed in importing fuse, but fail to load so + pytest.importorskip("nonexistent") # noqa: E402 + +from fsspec.fuse import main, run +from fsspec.implementations.memory import MemoryFileSystem + + +def host_fuse(mountdir): + fs = MemoryFileSystem() + fs.touch("/mounted/testfile") + run(fs, "/mounted/", mountdir) + + +def test_basic(tmpdir, capfd): + mountdir = str(tmpdir.mkdir("mount")) + + fuse_process = Process(target=host_fuse, args=(str(mountdir),)) + fuse_process.start() + + try: + timeout = 10 + while True: + try: + # can fail with device not ready while waiting for fuse + if "testfile" in os.listdir(mountdir): + break + except Exception: + pass + timeout -= 1 + time.sleep(1) + if not timeout > 0: + import pdb + + pdb.set_trace() + pytest.skip(msg="fuse didn't come live") + + fn = os.path.join(mountdir, "test") + with open(fn, "wb") as f: + f.write(b"data") + + with open(fn) as f: + assert f.read() == "data" + + os.remove(fn) + + os.mkdir(fn) + assert os.listdir(fn) == [] + + os.mkdir(fn + "/inner") + + with pytest.raises(OSError): + os.rmdir(fn) + + captured = capfd.readouterr() + assert "Traceback" not in captured.out + assert "Traceback" not in captured.err + + os.rmdir(fn + "/inner") + os.rmdir(fn) + finally: + fuse_process.terminate() + fuse_process.join(timeout=10) + if fuse_process.is_alive(): + fuse_process.kill() + fuse_process.join() + + +def host_mount_local(source_dir, mount_dir, debug_log): + main(["local", source_dir, mount_dir, "-l", debug_log, "--ready-file"]) + + +@pytest.fixture() +def mount_local(tmpdir): + source_dir = tmpdir.mkdir("source") + mount_dir = tmpdir.mkdir("local") + debug_log = tmpdir / "debug.log" + fuse_process = Process( + target=host_mount_local, args=(str(source_dir), str(mount_dir), str(debug_log)) + ) + fuse_process.start() + ready_file = mount_dir / ".fuse_ready" + for _ in range(20): + if ready_file.exists() and open(ready_file).read() == b"ready": + break + time.sleep(0.1) + try: + yield (source_dir, mount_dir) + finally: + fuse_process.terminate() + fuse_process.join(timeout=10) + if fuse_process.is_alive(): + fuse_process.kill() + fuse_process.join() + + +def test_mount(mount_local): + source_dir, mount_dir = mount_local + assert os.listdir(mount_dir) == [] + assert os.listdir(source_dir) == [] + + mount_dir.mkdir("a") + + assert os.listdir(mount_dir) == ["a"] + assert os.listdir(source_dir) == ["a"] + + +def test_chmod(mount_local): + source_dir, mount_dir = mount_local + open(mount_dir / "text", "w").write("test") + assert os.listdir(source_dir) == ["text"] + + cp = subprocess.run( + ["cp", str(mount_dir / "text"), str(mount_dir / "new")], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + + assert cp.stderr == b"" + assert cp.stdout == b"" + assert set(os.listdir(source_dir)) == {"text", "new"} + assert open(mount_dir / "new").read() == "test" + + +def test_seek_rw(mount_local): + source_dir, mount_dir = mount_local + fh = open(mount_dir / "text", "w") + fh.write("teST") + fh.seek(2) + fh.write("st") + fh.close() + + fh = open(mount_dir / "text", "r") + assert fh.read() == "test" + fh.seek(2) + assert fh.read() == "st" + fh.close() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4c8bf01754c01e56394d8331090e8dc33150fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py @@ -0,0 +1,90 @@ +import pytest + +import fsspec +from fsspec.tests.conftest import data, server # noqa: F401 + + +def test_remote_async_ops(server): + fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) + fs = fsspec.filesystem("generic", default_method="current") + out = fs.info(server + "/index/realfile") + assert out["size"] == len(data) + assert out["type"] == "file" + assert fs.isfile(server + "/index/realfile") # this method from superclass + + +def test_touch_rm(m): + m.touch("afile") + m.touch("dir/afile") + + fs = fsspec.filesystem("generic", default_method="current") + fs.rm("memory://afile") + assert not m.exists("afile") + + fs.rm("memory://dir", recursive=True) + assert not m.exists("dir/afile") + assert not m.exists("dir") + + +def test_cp_async_to_sync(server, m): + fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) + fs = fsspec.filesystem("generic", default_method="current") + fs.cp([server + "/index/realfile"], ["memory://realfile"]) + assert m.cat("realfile") == data + + fs.rm("memory://realfile") + assert not m.exists("realfile") + + +def test_pipe_cat_sync(m): + fs = fsspec.filesystem("generic", default_method="current") + fs.pipe("memory://afile", b"data") + assert fs.cat("memory://afile") == b"data" + + +def test_cat_async(server): + fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"}) + fs = fsspec.filesystem("generic", default_method="current") + assert fs.cat(server + "/index/realfile") == data + + +def test_rsync(tmpdir, m): + from fsspec.generic import GenericFileSystem, rsync + + fs = GenericFileSystem() + fs.pipe("memory:///deep/path/afile", b"data1") + fs.pipe("memory:///deep/afile", b"data2") + + with pytest.raises(ValueError): + rsync("memory:///deep/afile", f"file://{tmpdir}") + rsync("memory://", f"file://{tmpdir}") + + allfiles = fs.find(f"file://{tmpdir}", withdirs=True, detail=True) + pos_tmpdir = fsspec.implementations.local.make_path_posix(str(tmpdir)) # for WIN + assert set(allfiles) == { + f"file://{pos_tmpdir}{_}" + for _ in [ + "", + "/deep", + "/deep/path", + "/deep/path/afile", + "/deep/afile", + ] + } + fs.rm("memory:///deep/afile") + rsync("memory://", f"file://{tmpdir}", delete_missing=True) + allfiles2 = fs.find(f"file://{tmpdir}", withdirs=True, detail=True) + assert set(allfiles2) == { + f"file://{pos_tmpdir}{_}" + for _ in [ + "", + "/deep", + "/deep/path", + "/deep/path/afile", + ] + } + # the file was not updated, since size was correct + assert ( + allfiles[f"file://{pos_tmpdir}/deep/path/afile"] + == allfiles2[f"file://{pos_tmpdir}/deep/path/afile"] + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..38b83a2cfb49517e387091441d77f80e1ffa1aec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py @@ -0,0 +1,23 @@ +import pytest + +panel = pytest.importorskip("panel") + + +def test_basic(): + import fsspec.gui + + gui = fsspec.gui.FileSelector() + assert "url" in str(gui.panel) + + +def test_kwargs(tmpdir): + """confirm kwargs are passed to the filesystem instance""" + import fsspec.gui + + gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs="{'auto_mkdir': True}") + + assert gui.fs.auto_mkdir + + gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs={"auto_mkdir": True}) + + assert gui.fs.auto_mkdir diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..0d6c8fdde5a775dd3078461c61ef76948f0deb5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py @@ -0,0 +1,228 @@ +import os +import pickle +import platform +import sys +import uuid + +import pytest + +import fsspec +from fsspec.implementations.local import LocalFileSystem +from fsspec.implementations.memory import MemoryFileSystem + + +def test_mapping_prefix(tmpdir): + tmpdir = str(tmpdir) + os.makedirs(os.path.join(tmpdir, "afolder")) + open(os.path.join(tmpdir, "afile"), "w").write("test") + open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") + + m = fsspec.get_mapper(f"file://{tmpdir}") + assert "afile" in m + assert m["afolder/anotherfile"] == b"test2" + + fs = fsspec.filesystem("file") + m2 = fs.get_mapper(tmpdir) + m3 = fs.get_mapper(f"file://{tmpdir}") + + assert m == m2 == m3 + + +def test_getitems_errors(tmpdir): + tmpdir = str(tmpdir) + os.makedirs(os.path.join(tmpdir, "afolder")) + open(os.path.join(tmpdir, "afile"), "w").write("test") + open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") + m = fsspec.get_mapper(f"file://{tmpdir}") + assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} + with pytest.raises(KeyError): + m.getitems(["afile", "bfile"]) + out = m.getitems(["afile", "bfile"], on_error="return") + assert isinstance(out["bfile"], KeyError) + m = fsspec.get_mapper(f"file://{tmpdir}", missing_exceptions=()) + assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} + with pytest.raises(FileNotFoundError): + m.getitems(["afile", "bfile"]) + + +def test_ops(): + MemoryFileSystem.store.clear() + m = fsspec.get_mapper("memory://") + assert not m + assert list(m) == [] + + with pytest.raises(KeyError): + m["hi"] + + assert m.pop("key", 0) == 0 + + m["key0"] = b"data" + assert list(m) == ["key0"] + assert m["key0"] == b"data" + + m.clear() + + assert list(m) == [] + + +def test_pickle(): + m = fsspec.get_mapper("memory://") + assert isinstance(m.fs, MemoryFileSystem) + m["key"] = b"data" + m2 = pickle.loads(pickle.dumps(m)) + assert list(m) == list(m2) + assert m.missing_exceptions == m2.missing_exceptions + + +def test_keys_view(): + # https://github.com/fsspec/filesystem_spec/issues/186 + m = fsspec.get_mapper("memory://") + m["key"] = b"data" + + keys = m.keys() + assert len(keys) == 1 + # check that we don't consume the keys + assert len(keys) == 1 + m.clear() + + +def test_multi(): + m = fsspec.get_mapper("memory:///") + data = {"a": b"data1", "b": b"data2"} + m.setitems(data) + + assert m.getitems(list(data)) == data + m.delitems(list(data)) + assert not list(m) + + +def test_setitem_types(): + import array + + m = fsspec.get_mapper("memory://") + m["a"] = array.array("i", [1]) + if sys.byteorder == "little": + assert m["a"] == b"\x01\x00\x00\x00" + else: + assert m["a"] == b"\x00\x00\x00\x01" + m["b"] = bytearray(b"123") + assert m["b"] == b"123" + m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")}) + if sys.byteorder == "little": + assert m["c"] == b"\x01\x00\x00\x00" + else: + assert m["c"] == b"\x00\x00\x00\x01" + assert m["d"] == b"123" + + +def test_setitem_numpy(): + m = fsspec.get_mapper("memory://") + np = pytest.importorskip("numpy") + m["c"] = np.array(1, dtype="foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000)) + ).encode() + + # Read single record at beginning. + # All reads include beginning of file and read through termination of + # delimited record. + assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n" + assert ( + read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True) + == b"#header>foo0" + ) + assert ( + read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>" + ) + assert ( + read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True) + == b"#header>foo0\nFOOBAR0\n" + ) + + # Read multiple records at beginning. + # All reads include beginning of file and read through termination of + # delimited record. + assert ( + read_block(io.BytesIO(d), 0, 27, delimiter=b"\n") + == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" + ) + assert ( + read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True) + == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1" + ) + assert ( + read_block(io.BytesIO(d), 0, 27, delimiter=b">") + == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>" + ) + assert ( + read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True) + == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" + ) + + # Read with offset spanning into next record, splits on either side of delimiter. + # Read not spanning the full record returns nothing. + assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n" + assert ( + read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True) + == b"\nFOOBAR0" + ) + assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b"" + assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b"" + + # Read with offset spanning multiple records, splits on either side of delimiter + assert ( + read_block(io.BytesIO(d), 10, 20, delimiter=b"\n") + == b"FOOBAR0\n>foo1\nFOOBAR1\n" + ) + assert ( + read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True) + == b"\nFOOBAR0\n>foo1\nFOOBAR1" + ) + assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>" + assert ( + read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True) + == b">foo1\nFOOBAR1\n" + ) + + # Read record at end, all records read to end + + tlen = len(d) + + assert ( + read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n") + == b">foo99999\nFOOBAR99999\n" + ) + + assert ( + read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True) + == b"\n>foo99999\nFOOBAR99999\n" + ) + + assert ( + read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">") + == b"foo99999\nFOOBAR99999\n" + ) + + assert ( + read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True) + == b">foo99999\nFOOBAR99999\n" + ) + + +def test_seek_delimiter_endline(): + f = io.BytesIO(b"123\n456\n789") + + # if at zero, stay at zero + seek_delimiter(f, b"\n", 5) + assert f.tell() == 0 + + # choose the first block + for bs in [1, 5, 100]: + f.seek(1) + seek_delimiter(f, b"\n", blocksize=bs) + assert f.tell() == 4 + + # handle long delimiters well, even with short blocksizes + f = io.BytesIO(b"123abc456abc789") + for bs in [1, 2, 3, 4, 5, 6, 10]: + f.seek(1) + seek_delimiter(f, b"abc", blocksize=bs) + assert f.tell() == 6 + + # End at the end + f = io.BytesIO(b"123\n456") + f.seek(5) + seek_delimiter(f, b"\n", 5) + assert f.tell() == 7 + + +def test_infer_options(): + so = infer_storage_options("/mnt/datasets/test.csv") + assert so.pop("protocol") == "file" + assert so.pop("path") == "/mnt/datasets/test.csv" + assert not so + + assert infer_storage_options("./test.csv")["path"] == "./test.csv" + assert infer_storage_options("../test.csv")["path"] == "../test.csv" + + so = infer_storage_options("C:\\test.csv") + assert so.pop("protocol") == "file" + assert so.pop("path") == "C:\\test.csv" + assert not so + + assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv" + assert infer_storage_options("\\test.csv")["path"] == "\\test.csv" + assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv" + assert infer_storage_options("test.csv")["path"] == "test.csv" + + so = infer_storage_options( + "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm", + inherit_storage_options={"extra": "value"}, + ) + assert so.pop("protocol") == "hdfs" + assert so.pop("username") == "username" + assert so.pop("password") == "pwd" + assert so.pop("host") == "Node" + assert so.pop("port") == 123 + assert so.pop("path") == "/mnt/datasets/test.csv#fragm" + assert so.pop("url_query") == "q=1" + assert so.pop("url_fragment") == "fragm" + assert so.pop("extra") == "value" + assert not so + + so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv") + assert so.pop("username") == "User-name" + assert so.pop("host") == "Node-name.com" + + u = "http://127.0.0.1:8080/test.csv" + assert infer_storage_options(u) == {"protocol": "http", "path": u} + + # For s3 and gcs the netloc is actually the bucket name, so we want to + # include it in the path. Test that: + # - Parsing doesn't lowercase the bucket + # - The bucket is included in path + for protocol in ["s3", "s3a", "gcs", "gs"]: + options = infer_storage_options(f"{protocol}://Bucket-name.com/test.csv") + assert options["path"] == "Bucket-name.com/test.csv" + + with pytest.raises(KeyError): + infer_storage_options("file:///bucket/file.csv", {"path": "collide"}) + with pytest.raises(KeyError): + infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"}) + + +def test_infer_simple(): + out = infer_storage_options("//mnt/datasets/test.csv") + assert out["protocol"] == "file" + assert out["path"] == "//mnt/datasets/test.csv" + assert out.get("host", None) is None + + +@pytest.mark.parametrize( + "urlpath, expected_path", + ( + (r"c:\foo\bar", r"c:\foo\bar"), + (r"C:\\foo\bar", r"C:\\foo\bar"), + (r"c:/foo/bar", r"c:/foo/bar"), + (r"file:///c|\foo\bar", r"c:\foo\bar"), + (r"file:///C|/foo/bar", r"C:/foo/bar"), + (r"file:///C:/foo/bar", r"C:/foo/bar"), + ), +) +def test_infer_storage_options_c(urlpath, expected_path): + so = infer_storage_options(urlpath) + assert so["protocol"] == "file" + assert so["path"] == expected_path + + +@pytest.mark.parametrize( + "paths, out", + ( + (["/more/dir/", "/more/dir/two", "/more/one", "/more/three"], "/more"), + (["/", "", "/"], ""), + (["/", "/"], "/"), + (["/more/", "/"], ""), + (["/more/", "/more"], "/more"), + (["more/dir/", "more/dir/two", "more/one", "more/three"], "more"), + ), +) +def test_common_prefix(paths, out): + assert common_prefix(paths) == out + + +@pytest.mark.parametrize( + "paths, other, exists, expected", + ( + (["/path1"], "/path2", False, ["/path2"]), + (["/path1"], "/path2", True, ["/path2/path1"]), + (["/path1"], "/path2", False, ["/path2"]), + (["/path1"], "/path2/", True, ["/path2/path1"]), + (["/path1"], ["/path2"], False, ["/path2"]), + (["/path1"], ["/path2"], True, ["/path2"]), + (["/path1", "/path2"], "/path2", False, ["/path2/path1", "/path2/path2"]), + (["/path1", "/path2"], "/path2", True, ["/path2/path1", "/path2/path2"]), + ( + ["/more/path1", "/more/path2"], + "/path2", + False, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2", + True, + ["/path2/more/path1", "/path2/more/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2", + False, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2", + True, + ["/path2/more/path1", "/path2/more/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2/", + False, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2/", + True, + ["/path2/more/path1", "/path2/more/path2"], + ), + ( + ["/more/path1", "/diff/path2"], + "/path2/", + False, + ["/path2/more/path1", "/path2/diff/path2"], + ), + ( + ["/more/path1", "/diff/path2"], + "/path2/", + True, + ["/path2/more/path1", "/path2/diff/path2"], + ), + (["a", "b/", "b/c"], "dest/", False, ["dest/a", "dest/b/", "dest/b/c"]), + ( + ["/a", "/b/", "/b/c"], + "dest/", + False, + ["dest/a", "dest/b/", "dest/b/c"], + ), + ), +) +def test_other_paths(paths, other, exists, expected): + assert other_paths(paths, other, exists) == expected + + +def test_log(): + import logging + + logger = setup_logging(logger_name="fsspec.test") + assert logger.level == logging.DEBUG + + +@pytest.mark.parametrize( + "par", + [ + ("afile", "file"), + ("file://afile", "file"), + ("noproto://afile", "noproto"), + ("noproto::stuff", "noproto"), + ("simplecache::stuff", "simplecache"), + ("simplecache://stuff", "simplecache"), + ("s3://afile", "s3"), + (Path("afile"), "file"), + ], +) +def test_get_protocol(par): + url, outcome = par + assert get_protocol(url) == outcome + + +@pytest.mark.parametrize( + "par", + [ + ("afile", True), + ("file://afile", True), + ("noproto://afile", False), + ("noproto::stuff", False), + ("simplecache::stuff", True), + ("simplecache://stuff", True), + (Path("afile"), True), + ], +) +def test_can_local(par): + url, outcome = par + assert can_be_local(url) == outcome + + +def test_mirror_from(): + mock = Mock() + mock.attr = 1 + + @mirror_from("client", ["attr", "func_1", "func_2"]) + class Real: + @property + def client(self): + return mock + + def func_2(self): + assert False, "have to overwrite this" + + def func_3(self): + return "should succeed" + + obj = Real() + assert obj.attr == mock.attr + + obj.func_1() + mock.func_1.assert_called() + + obj.func_2(1, 2) + mock.func_2.assert_called_with(1, 2) + + assert obj.func_3() == "should succeed" + mock.func_3.assert_not_called() + + +@pytest.mark.parametrize("max_gap", [0, 32]) +@pytest.mark.parametrize("max_block", [None, 128]) +def test_merge_offset_ranges(max_gap, max_block): + # Input ranges + # (Using out-of-order ranges for full coverage) + paths = ["foo", "bar", "bar", "bar", "foo"] + starts = [0, 0, 512, 64, 32] + ends = [32, 32, 1024, 256, 64] + + # Call merge_offset_ranges + ( + result_paths, + result_starts, + result_ends, + ) = merge_offset_ranges( + paths, + starts, + ends, + max_gap=max_gap, + max_block=max_block, + ) + + # Check result + if max_block is None and max_gap == 32: + expect_paths = ["bar", "bar", "foo"] + expect_starts = [0, 512, 0] + expect_ends = [256, 1024, 64] + else: + expect_paths = ["bar", "bar", "bar", "foo"] + expect_starts = [0, 64, 512, 0] + expect_ends = [32, 256, 1024, 64] + + assert expect_paths == result_paths + assert expect_starts == result_starts + assert expect_ends == result_ends + + +def test_size(): + f = io.BytesIO(b"hello") + assert fsspec.utils.file_size(f) == 5 + assert f.tell() == 0 + + +class _HasFspath: + def __fspath__(self): + return "foo" + + +class _HasPathAttr: + def __init__(self): + self.path = "foo" + + +@pytest.mark.parametrize( + "path,expected", + [ + # coerce to string + ("foo", "foo"), + (Path("foo"), "foo"), + (PurePath("foo"), "foo"), + (_HasFspath(), "foo"), + (_HasPathAttr(), "foo"), + # passthrough + (b"bytes", b"bytes"), + (None, None), + (1, 1), + (True, True), + (o := object(), o), + ([], []), + ((), ()), + (set(), set()), + ], +) +def test_stringify_path(path, expected): + path = fsspec.utils.stringify_path(path) + + assert path == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64c547458e418e51e1e349ad1a90787f8d749fbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fa8596c874ef45b38a7e70664b1aa3af0ed9ef0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed34c182e00d05e773ce340be649b07cf1e198d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e4305a2362523a01e21c2e48ae9b0622298cac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0622c73d42ecd0ec13da67fa22437eea6e3091a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41dc466109d6c974fb77e7af000d4cd1d60fe066 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b8de17cf8c3020747b2f9e2e092b9ef5fce5c96 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..470e8112d8c81d8fa15336ff6e56fde0616924c2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9551006a10c4e82a4d4ec3f06e81648c40d380b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db44479c142c01c453c3991c68d1dff66166c368 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a32d00b06f25b0bfd456fa482b5d0e459f97d3be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5163f56add1e7fe8d3f667d9968d767bfcd6565 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee47a83f78807fa33b573d23024a8f3df7df3a68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7984c0833594a220c42887f03f39ce3873a41926 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49599ddd7de19acbd26134b4c47f988a138a7dec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd new file mode 100644 index 0000000000000000000000000000000000000000..8b1f3c4c516b23ec938da286e0ddb7b8f5795ee2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd @@ -0,0 +1,25 @@ +from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar +from lxml.includes.xpath cimport xmlNodeSet + +cdef extern from "libxml/c14n.h" nogil: + cdef int xmlC14NDocDumpMemory(xmlDoc* doc, + xmlNodeSet* nodes, + int exclusive, + xmlChar** inclusive_ns_prefixes, + int with_comments, + xmlChar** doc_txt_ptr) + + cdef int xmlC14NDocSave(xmlDoc* doc, + xmlNodeSet* nodes, + int exclusive, + xmlChar** inclusive_ns_prefixes, + int with_comments, + char* filename, + int compression) + + cdef int xmlC14NDocSaveTo(xmlDoc* doc, + xmlNodeSet* nodes, + int exclusive, + xmlChar** inclusive_ns_prefixes, + int with_comments, + xmlOutputBuffer* buffer) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd new file mode 100644 index 0000000000000000000000000000000000000000..2ad49db11b20e4e710f4f55cf5590ef3928f1058 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd @@ -0,0 +1,18 @@ +from lxml.includes cimport tree +from lxml.includes.tree cimport xmlDoc, xmlDtd + +cdef extern from "libxml/valid.h" nogil: + ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...) noexcept + ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...) noexcept + + ctypedef struct xmlValidCtxt: + void *userData + xmlValidityErrorFunc error + xmlValidityWarningFunc warning + + cdef xmlValidCtxt* xmlNewValidCtxt() + cdef void xmlFreeValidCtxt(xmlValidCtxt* cur) + + cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd) + cdef tree.xmlElement* xmlGetDtdElementDesc( + xmlDtd* dtd, tree.const_xmlChar* name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..0cb6eb3a58c5910770733e8837c5d296049fe4cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h @@ -0,0 +1,390 @@ +#ifndef HAS_ETREE_DEFS_H +#define HAS_ETREE_DEFS_H + +/* quick check for Python/libxml2/libxslt devel setup */ +#include "Python.h" +#ifndef PY_VERSION_HEX +# error the development package of Python (header files etc.) is not installed correctly +#elif PY_VERSION_HEX < 0x03060000 +# error this version of lxml requires Python 3.6 or later +#endif + +#include "libxml/xmlversion.h" +#ifndef LIBXML_VERSION +# error the development package of libxml2 (header files etc.) is not installed correctly +#elif LIBXML_VERSION < 20700 +# error minimum required version of libxml2 is 2.7.0 +#endif + +#include "libxslt/xsltconfig.h" +#ifndef LIBXSLT_VERSION +# error the development package of libxslt (header files etc.) is not installed correctly +#elif LIBXSLT_VERSION < 10123 +# error minimum required version of libxslt is 1.1.23 +#endif + + +/* v_arg functions */ +#define va_int(ap) va_arg(ap, int) +#define va_charptr(ap) va_arg(ap, char *) + +#ifdef PYPY_VERSION +# define IS_PYPY 1 +#else +# define IS_PYPY 0 +#endif + +/* unused */ +#define IS_PYTHON2 0 +#define IS_PYTHON3 1 +#undef LXML_UNICODE_STRINGS +#define LXML_UNICODE_STRINGS 1 + +#if !IS_PYPY +# define PyWeakref_LockObject(obj) (NULL) +#endif + +/* Threading is not currently supported by PyPy */ +#if IS_PYPY +# ifndef WITHOUT_THREADING +# define WITHOUT_THREADING +# endif +#endif + +#if IS_PYPY +# ifndef PyUnicode_FromFormat +# define PyUnicode_FromFormat PyString_FromFormat +# endif +# if !defined(PyBytes_FromFormat) +# ifdef PyString_FromFormat +# define PyBytes_FromFormat PyString_FromFormat +# else +#include +static PyObject* PyBytes_FromFormat(const char* format, ...) { + PyObject *string; + va_list vargs; +#ifdef HAVE_STDARG_PROTOTYPES + va_start(vargs, format); +#else + va_start(vargs); +#endif + string = PyUnicode_FromFormatV(format, vargs); + va_end(vargs); + if (string && PyUnicode_Check(string)) { + PyObject *bstring = PyUnicode_AsUTF8String(string); + Py_DECREF(string); + string = bstring; + } + if (string && !PyBytes_CheckExact(string)) { + Py_DECREF(string); + string = NULL; + PyErr_SetString(PyExc_TypeError, "String formatting and encoding failed to return bytes object"); + } + return string; +} +# endif +# endif +#endif + +#if PY_VERSION_HEX >= 0x030B00A1 +/* Python 3.12 doesn't have wstr Unicode strings any more. */ +#undef PyUnicode_GET_DATA_SIZE +#define PyUnicode_GET_DATA_SIZE(ustr) (0) +#undef PyUnicode_AS_DATA +#define PyUnicode_AS_DATA(ustr) (NULL) +#undef PyUnicode_IS_READY +#define PyUnicode_IS_READY(ustr) (1) +#endif + +#ifdef WITHOUT_THREADING +# undef PyEval_SaveThread +# define PyEval_SaveThread() (NULL) +# undef PyEval_RestoreThread +# define PyEval_RestoreThread(state) if (state); else {} +# undef PyGILState_Ensure +# define PyGILState_Ensure() (PyGILState_UNLOCKED) +# undef PyGILState_Release +# define PyGILState_Release(state) if (state); else {} +# undef Py_UNBLOCK_THREADS +# define Py_UNBLOCK_THREADS _save = NULL; +# undef Py_BLOCK_THREADS +# define Py_BLOCK_THREADS if (_save); else {} +#endif + +#ifdef WITHOUT_THREADING +# define ENABLE_THREADING 0 +#else +# define ENABLE_THREADING 1 +#endif + +#if LIBXML_VERSION < 20704 +/* FIXME: hack to make new error reporting compile in old libxml2 versions */ +# define xmlStructuredErrorContext NULL +# define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o) +#endif + +/* schematron was added in libxml2 2.6.21 */ +#ifdef LIBXML_SCHEMATRON_ENABLED +# define ENABLE_SCHEMATRON 1 +#else +# define ENABLE_SCHEMATRON 0 +# define XML_SCHEMATRON_OUT_QUIET 0 +# define XML_SCHEMATRON_OUT_XML 0 +# define XML_SCHEMATRON_OUT_ERROR 0 + typedef void xmlSchematron; + typedef void xmlSchematronParserCtxt; + typedef void xmlSchematronValidCtxt; +# define xmlSchematronNewDocParserCtxt(doc) NULL +# define xmlSchematronNewParserCtxt(file) NULL +# define xmlSchematronParse(ctxt) NULL +# define xmlSchematronFreeParserCtxt(ctxt) +# define xmlSchematronFree(schema) +# define xmlSchematronNewValidCtxt(schema, options) NULL +# define xmlSchematronValidateDoc(ctxt, doc) 0 +# define xmlSchematronFreeValidCtxt(ctxt) +# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) +#endif + +#if LIBXML_VERSION < 20708 +# define HTML_PARSE_NODEFDTD 4 +#endif +#if LIBXML_VERSION < 20900 +# define XML_PARSE_BIG_LINES 0x400000 +#endif +#if LIBXML_VERSION < 21300 +# define XML_PARSE_NO_XXE 0x800000 +#endif +#if LIBXML_VERSION < 21400 +# define XML_PARSE_UNZIP 0x1000000 +# define XML_PARSE_NO_SYS_CATALOG 0x2000000 +# define XML_PARSE_CATALOG_PI 0x4000000 +#endif +#if LIBXML_VERSION < 21500 +# define XML_PARSE_SKIP_IDS 0x8000000 +#endif + +#include "libxml/tree.h" +#ifndef LIBXML2_NEW_BUFFER + typedef xmlBuffer xmlBuf; +# define xmlBufContent(buf) xmlBufferContent(buf) +# define xmlBufUse(buf) xmlBufferLength(buf) +#endif + +#if LIBXML_VERSION < 21500 +# define xmlCtxtIsStopped(p_ctxt) ((p_ctxt)->disableSAX != 0) +#endif + +/* libexslt 1.1.25+ support EXSLT functions in XPath */ +#if LIBXSLT_VERSION < 10125 +#define exsltDateXpathCtxtRegister(ctxt, prefix) +#define exsltSetsXpathCtxtRegister(ctxt, prefix) +#define exsltMathXpathCtxtRegister(ctxt, prefix) +#define exsltStrXpathCtxtRegister(ctxt, prefix) +#endif + +#define LXML_GET_XSLT_ENCODING(result_var, style) XSLT_GET_IMPORT_PTR(result_var, style, encoding) + +/* work around MSDEV 6.0 */ +#if (_MSC_VER == 1200) && (WINVER < 0x0500) +long _ftol( double ); //defined by VC6 C libs +long _ftol2( double dblSource ) { return _ftol( dblSource ); } +#endif + +#ifdef __GNUC__ +/* Test for GCC > 2.95 */ +#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) +#define unlikely_condition(x) __builtin_expect((x), 0) +#else /* __GNUC__ > 2 ... */ +#define unlikely_condition(x) (x) +#endif /* __GNUC__ > 2 ... */ +#else /* __GNUC__ */ +#define unlikely_condition(x) (x) +#endif /* __GNUC__ */ + +#ifndef Py_TYPE + #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) +#endif + +#define _fqtypename(o) ((Py_TYPE(o))->tp_name) + +#define lxml_malloc(count, item_size) \ + (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \ + (PyMem_Malloc((count) * item_size))) + +#define lxml_realloc(mem, count, item_size) \ + (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \ + (PyMem_Realloc(mem, (count) * item_size))) + +#define lxml_free(mem) PyMem_Free(mem) + +#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj)) + +#define _isElement(c_node) \ + (((c_node)->type == XML_ELEMENT_NODE) || \ + ((c_node)->type == XML_COMMENT_NODE) || \ + ((c_node)->type == XML_ENTITY_REF_NODE) || \ + ((c_node)->type == XML_PI_NODE)) + +#define _isElementOrXInclude(c_node) \ + (_isElement(c_node) || \ + ((c_node)->type == XML_XINCLUDE_START) || \ + ((c_node)->type == XML_XINCLUDE_END)) + +#define _getNs(c_node) \ + (((c_node)->ns == 0) ? 0 : ((c_node)->ns->href)) + + +#include "string.h" +static void* lxml_unpack_xmldoc_capsule(PyObject* capsule, int* is_owned) { + xmlDoc *c_doc; + void *context; + *is_owned = 0; + if (unlikely_condition(!PyCapsule_IsValid(capsule, (const char*)"libxml2:xmlDoc"))) { + PyErr_SetString( + PyExc_TypeError, + "Not a valid capsule. The capsule argument must be a capsule object with name libxml2:xmlDoc"); + return NULL; + } + c_doc = (xmlDoc*) PyCapsule_GetPointer(capsule, (const char*)"libxml2:xmlDoc"); + if (unlikely_condition(!c_doc)) return NULL; + + if (unlikely_condition(c_doc->type != XML_DOCUMENT_NODE && c_doc->type != XML_HTML_DOCUMENT_NODE)) { + PyErr_Format( + PyExc_ValueError, + "Illegal document provided: expected XML or HTML, found %d", (int)c_doc->type); + return NULL; + } + + context = PyCapsule_GetContext(capsule); + if (unlikely_condition(!context && PyErr_Occurred())) return NULL; + if (context && strcmp((const char*) context, "destructor:xmlFreeDoc") == 0) { + /* take ownership by setting destructor to NULL */ + if (PyCapsule_SetDestructor(capsule, NULL) == 0) { + /* ownership transferred => invalidate capsule by clearing its name */ + if (unlikely_condition(PyCapsule_SetName(capsule, NULL))) { + /* this should never happen since everything above succeeded */ + xmlFreeDoc(c_doc); + return NULL; + } + *is_owned = 1; + } + } + return c_doc; +} + +/* Macro pair implementation of a depth first tree walker + * + * Calls the code block between the BEGIN and END macros for all elements + * below c_tree_top (exclusively), starting at c_node (inclusively iff + * 'inclusive' is 1). The _ELEMENT_ variants will only stop on nodes + * that match _isElement(), the normal variant will stop on every node + * except text nodes. + * + * To traverse the node and all of its children and siblings in Pyrex, call + * cdef xmlNode* some_node + * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) + * + * To traverse only the children and siblings of a node, call + * cdef xmlNode* some_node + * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) + * + * To traverse only the children, do: + * cdef xmlNode* some_node + * some_node = parent_node.children + * BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) + * + * NOTE: 'some_node' MUST be a plain 'xmlNode*' ! + * + * NOTE: parent modification during the walk can divert the iterator, but + * should not segfault ! + */ + +#define _LX__ELEMENT_MATCH(c_node, only_elements) \ + ((only_elements) ? (_isElement(c_node)) : 1) + +#define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \ + while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \ + c_node = c_node->next; + +#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \ +{ \ + /* walk through children first */ \ + xmlNode* _lx__next = c_node->children; \ + if (_lx__next != 0) { \ + if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \ + _lx__next = 0; \ + } else { \ + _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + } \ + } \ + if ((_lx__next == 0) && (c_node != c_stop_node)) { \ + /* try siblings */ \ + _lx__next = c_node->next; \ + _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + /* back off through parents */ \ + while (_lx__next == 0) { \ + c_node = c_node->parent; \ + if (c_node == 0) \ + break; \ + if (c_node == c_stop_node) \ + break; \ + if ((only_elements) && !_isElement(c_node)) \ + break; \ + /* we already traversed the parents -> siblings */ \ + _lx__next = c_node->next; \ + _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + } \ + } \ + c_node = _lx__next; \ +} + +#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements) \ +{ \ + if (c_node != 0) { \ + const xmlNode* _lx__tree_top = (c_tree_top); \ + const int _lx__only_elements = (only_elements); \ + /* make sure we start at an element */ \ + if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) { \ + /* we skip the node, so 'inclusive' is irrelevant */ \ + if (c_node == _lx__tree_top) \ + c_node = 0; /* nothing to traverse */ \ + else { \ + c_node = c_node->next; \ + _LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements) \ + } \ + } else if (! (inclusive)) { \ + /* skip the first node */ \ + _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \ + } \ + \ + /* now run the user code on the elements we find */ \ + while (c_node != 0) { \ + /* here goes the code to be run for each element */ + +#define _LX__END_FOR_EACH_FROM(c_node) \ + _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \ + } \ + } \ +} + + +#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \ + _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1) + +#define END_FOR_EACH_ELEMENT_FROM(c_node) \ + _LX__END_FOR_EACH_FROM(c_node) + +#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive) \ + _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0) + +#define END_FOR_EACH_FROM(c_node) \ + _LX__END_FOR_EACH_FROM(c_node) + + +#endif /* HAS_ETREE_DEFS_H */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd new file mode 100644 index 0000000000000000000000000000000000000000..7ef001b17b765de2a94172412681e6029e931c54 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd @@ -0,0 +1,237 @@ +# public Cython/C interface to lxml.etree + +from lxml.includes cimport tree +from lxml.includes.tree cimport const_xmlChar + +cdef extern from "lxml-version.h": + cdef char* LXML_VERSION_STRING + +cdef extern from "etree_defs.h": + # test if c_node is considered an Element (i.e. Element, Comment, etc.) + cdef bint _isElement(tree.xmlNode* c_node) noexcept nogil + + # return the namespace URI of the node or NULL + cdef const_xmlChar* _getNs(tree.xmlNode* node) noexcept nogil + + # pair of macros for tree traversal + cdef void BEGIN_FOR_EACH_ELEMENT_FROM(tree.xmlNode* tree_top, + tree.xmlNode* start_node, + int start_node_inclusive) noexcept nogil + cdef void END_FOR_EACH_ELEMENT_FROM(tree.xmlNode* start_node) noexcept nogil + +cdef extern from "etree_api.h": + + # first function to call! + cdef int import_lxml__etree() except -1 + + ########################################################################## + # public ElementTree API classes + + cdef class lxml.etree._Document [ object LxmlDocument ]: + cdef tree.xmlDoc* _c_doc + + cdef class lxml.etree._Element [ object LxmlElement ]: + cdef _Document _doc + cdef tree.xmlNode* _c_node + + cdef class lxml.etree.ElementBase(_Element) [ object LxmlElementBase ]: + pass + + cdef class lxml.etree._ElementTree [ object LxmlElementTree ]: + cdef _Document _doc + cdef _Element _context_node + + cdef class lxml.etree.ElementClassLookup [ object LxmlElementClassLookup ]: + cdef object (*_lookup_function)(object, _Document, tree.xmlNode*) + + cdef class lxml.etree.FallbackElementClassLookup(ElementClassLookup) \ + [ object LxmlFallbackElementClassLookup ]: + cdef ElementClassLookup fallback + cdef object (*_fallback_function)(object, _Document, tree.xmlNode*) + + ########################################################################## + # creating Element objects + + # create an Element for a C-node in the Document + cdef _Element elementFactory(_Document doc, tree.xmlNode* c_node) + + # create an ElementTree for an Element + cdef _ElementTree elementTreeFactory(_Element context_node) + + # create an ElementTree subclass for an Element + cdef _ElementTree newElementTree(_Element context_node, object subclass) + + # create an ElementTree from an external document + cdef _ElementTree adoptExternalDocument(tree.xmlDoc* c_doc, parser, bint is_owned) + + # create a new Element for an existing or new document (doc = None) + # builds Python object after setting text, tail, namespaces and attributes + cdef _Element makeElement(tag, _Document doc, parser, + text, tail, attrib, nsmap) + + # create a new SubElement for an existing parent + # builds Python object after setting text, tail, namespaces and attributes + cdef _Element makeSubElement(_Element parent, tag, text, tail, + attrib, nsmap) + + # deep copy a node to include it in the Document + cdef _Element deepcopyNodeToDocument(_Document doc, tree.xmlNode* c_root) + + # set the internal lookup function for Element/Comment/PI classes + # use setElementClassLookupFunction(NULL, None) to reset it + # note that the lookup function *must always* return an _Element subclass! + cdef void setElementClassLookupFunction( + object (*function)(object, _Document, tree.xmlNode*), object state) + + # lookup function that always returns the default Element class + # note that the first argument is expected to be None! + cdef object lookupDefaultElementClass(_1, _Document _2, + tree.xmlNode* c_node) + + # lookup function for namespace/tag specific Element classes + # note that the first argument is expected to be None! + cdef object lookupNamespaceElementClass(_1, _Document _2, + tree.xmlNode* c_node) + + # call the fallback lookup function of a FallbackElementClassLookup + cdef object callLookupFallback(FallbackElementClassLookup lookup, + _Document doc, tree.xmlNode* c_node) + + ########################################################################## + # XML attribute access + + # return an attribute value for a C attribute on a C element node + cdef unicode attributeValue(tree.xmlNode* c_element, + tree.xmlAttr* c_attrib_node) + + # return the value of the attribute with 'ns' and 'name' (or None) + cdef unicode attributeValueFromNsName(tree.xmlNode* c_element, + const_xmlChar* c_ns, const_xmlChar* c_name) + + # return the value of attribute "{ns}name", or the default value + cdef object getAttributeValue(_Element element, key, default) + + # return an iterator over attribute names (1), values (2) or items (3) + # attributes must not be removed during iteration! + cdef object iterattributes(_Element element, int keysvalues) + + # return the list of all attribute names (1), values (2) or items (3) + cdef list collectAttributes(tree.xmlNode* c_element, int keysvalues) + + # set an attribute value on an element + # on failure, sets an exception and returns -1 + cdef int setAttributeValue(_Element element, key, value) except -1 + + # delete an attribute + # on failure, sets an exception and returns -1 + cdef int delAttribute(_Element element, key) except -1 + + # delete an attribute based on name and namespace URI + # returns -1 if the attribute was not found (no exception) + cdef int delAttributeFromNsName(tree.xmlNode* c_element, + const_xmlChar* c_href, const_xmlChar* c_name) noexcept + + ########################################################################## + # XML node helper functions + + # check if the element has at least one child + cdef bint hasChild(tree.xmlNode* c_node) noexcept nogil + + # find child element number 'index' (supports negative indexes) + cdef tree.xmlNode* findChild(tree.xmlNode* c_node, + Py_ssize_t index) noexcept nogil + + # find child element number 'index' starting at first one + cdef tree.xmlNode* findChildForwards(tree.xmlNode* c_node, + Py_ssize_t index) nogil + + # find child element number 'index' starting at last one + cdef tree.xmlNode* findChildBackwards(tree.xmlNode* c_node, + Py_ssize_t index) nogil + + # return next/previous sibling element of the node + cdef tree.xmlNode* nextElement(tree.xmlNode* c_node) nogil + cdef tree.xmlNode* previousElement(tree.xmlNode* c_node) nogil + + ########################################################################## + # iterators (DEPRECATED API, don't use in new code!) + + cdef class lxml.etree._ElementTagMatcher [ object LxmlElementTagMatcher ]: + cdef char* _href + cdef char* _name + + # store "{ns}tag" (or None) filter for this matcher or element iterator + # ** unless _href *and* _name are set up 'by hand', this function *must* + # ** be called when subclassing the iterator below! + cdef void initTagMatch(_ElementTagMatcher matcher, tag) + + cdef class lxml.etree._ElementIterator(_ElementTagMatcher) [ + object LxmlElementIterator ]: + cdef _Element _node + cdef tree.xmlNode* (*_next_element)(tree.xmlNode*) + + # store the initial node of the iterator if it matches the required tag + # or its next matching sibling if not + cdef void iteratorStoreNext(_ElementIterator iterator, _Element node) + + ########################################################################## + # other helper functions + + # check if a C node matches a tag name and namespace + # (NULL allowed for each => always matches) + cdef int tagMatches(tree.xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name) + + # convert a UTF-8 char* to a Python unicode string + cdef unicode pyunicode(const_xmlChar* s) + + # convert the string to UTF-8 using the normal lxml.etree semantics + cdef bytes utf8(object s) + + # split a tag into a (URI, name) tuple, return None as URI for '{}tag' + cdef tuple getNsTag(object tag) + + # split a tag into a (URI, name) tuple, return b'' as URI for '{}tag' + cdef tuple getNsTagWithEmptyNs(object tag) + + # get the "{ns}tag" string for a C node + cdef unicode namespacedName(tree.xmlNode* c_node) + + # get the "{ns}tag" string for a href/tagname pair (c_ns may be NULL) + cdef unicode namespacedNameFromNsName(const_xmlChar* c_ns, const_xmlChar* c_tag) + + # check if the node has a text value (which may be '') + cdef bint hasText(tree.xmlNode* c_node) nogil + + # check if the node has a tail value (which may be '') + cdef bint hasTail(tree.xmlNode* c_node) nogil + + # get the text content of an element (or None) + cdef unicode textOf(tree.xmlNode* c_node) + + # get the tail content of an element (or None) + cdef unicode tailOf(tree.xmlNode* c_node) + + # set the text value of an element + cdef int setNodeText(tree.xmlNode* c_node, text) except -1 + + # set the tail text value of an element + cdef int setTailText(tree.xmlNode* c_node, text) except -1 + + # append an element to the children of a parent element + # deprecated: don't use, does not propagate exceptions! + # use appendChildToElement() instead + cdef void appendChild(_Element parent, _Element child) + + # added in lxml 3.3 as a safe replacement for appendChild() + # return -1 for exception, 0 for ok + cdef int appendChildToElement(_Element parent, _Element child) except -1 + + # recursively lookup a namespace in element or ancestors, or create it + cdef tree.xmlNs* findOrBuildNodeNsPrefix( + _Document doc, tree.xmlNode* c_node, const_xmlChar* href, const_xmlChar* prefix) + + # find the Document of an Element, ElementTree or Document (itself!) + cdef _Document documentOrRaise(object input) + + # find the root Element of an Element (itself!), ElementTree or Document + cdef _Element rootNodeOrRaise(object input) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b90e0754aa815daecf3b228e11375a0a7e9ea876 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h new file mode 100644 index 0000000000000000000000000000000000000000..fcf22748101279e454fd2fefe01908fd8545bce2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h @@ -0,0 +1,45 @@ +/* Copyright (C) 2003 Free Software Foundation, Inc. + This file is part of the GNU CHARSET Library. + + The GNU CHARSET Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU CHARSET Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU CHARSET Library; see the file COPYING.LIB. If not, + see . */ + +#ifndef _LIBCHARSET_H +#define _LIBCHARSET_H + +#include + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Support for relocatable packages. */ + +/* Sets the original and the current installation prefix of the package. + Relocation simply replaces a pathname starting with the original prefix + by the corresponding pathname with the current prefix instead. Both + prefixes should be directory names without trailing slash (i.e. use "" + instead of "/"). */ +extern void libcharset_set_relocation_prefix (const char *orig_prefix, + const char *curr_prefix); + + +#ifdef __cplusplus +} +#endif + + +#endif /* _LIBCHARSET_H */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h new file mode 100644 index 0000000000000000000000000000000000000000..34ce0adde9bb793f1c1cd5f81b5cc3d2eff08ab1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h @@ -0,0 +1,137 @@ +/* Determine a canonical name for the current locale's character encoding. + Copyright (C) 2000-2003, 2009-2019 Free Software Foundation, Inc. + This file is part of the GNU CHARSET Library. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, see . */ + +#ifndef _LOCALCHARSET_H +#define _LOCALCHARSET_H + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Determine the current locale's character encoding, and canonicalize it + into one of the canonical names listed below. + The result must not be freed; it is statically allocated. The result + becomes invalid when setlocale() is used to change the global locale, or + when the value of one of the environment variables LC_ALL, LC_CTYPE, LANG + is changed; threads in multithreaded programs should not do this. + If the canonical name cannot be determined, the result is a non-canonical + name. */ +extern const char * locale_charset (void); + +/* About GNU canonical names for character encodings: + + Every canonical name must be supported by GNU libiconv. Support by GNU libc + is also desirable. + + The name is case insensitive. Usually an upper case MIME charset name is + preferred. + + The current list of these GNU canonical names is: + + name MIME? used by which systems + (darwin = Mac OS X, windows = native Windows) + + ASCII, ANSI_X3.4-1968 glibc solaris freebsd netbsd darwin minix cygwin + ISO-8859-1 Y glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos + ISO-8859-2 Y glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos + ISO-8859-3 Y glibc solaris cygwin + ISO-8859-4 Y hpux osf solaris freebsd netbsd openbsd darwin + ISO-8859-5 Y glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos + ISO-8859-6 Y glibc aix hpux solaris cygwin + ISO-8859-7 Y glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos + ISO-8859-8 Y glibc aix hpux osf solaris cygwin zos + ISO-8859-9 Y glibc aix hpux irix osf solaris freebsd darwin cygwin zos + ISO-8859-13 glibc hpux solaris freebsd netbsd openbsd darwin cygwin + ISO-8859-14 glibc cygwin + ISO-8859-15 glibc aix irix osf solaris freebsd netbsd openbsd darwin cygwin + KOI8-R Y glibc hpux solaris freebsd netbsd openbsd darwin + KOI8-U Y glibc freebsd netbsd openbsd darwin cygwin + KOI8-T glibc + CP437 dos + CP775 dos + CP850 aix osf dos + CP852 dos + CP855 dos + CP856 aix + CP857 dos + CP861 dos + CP862 dos + CP864 dos + CP865 dos + CP866 freebsd netbsd openbsd darwin dos + CP869 dos + CP874 windows dos + CP922 aix + CP932 aix cygwin windows dos + CP943 aix zos + CP949 osf darwin windows dos + CP950 windows dos + CP1046 aix + CP1124 aix + CP1125 dos + CP1129 aix + CP1131 freebsd darwin + CP1250 windows + CP1251 glibc hpux solaris freebsd netbsd openbsd darwin cygwin windows + CP1252 aix windows + CP1253 windows + CP1254 windows + CP1255 glibc windows + CP1256 windows + CP1257 windows + GB2312 Y glibc aix hpux irix solaris freebsd netbsd darwin cygwin zos + EUC-JP Y glibc aix hpux irix osf solaris freebsd netbsd darwin cygwin + EUC-KR Y glibc aix hpux irix osf solaris freebsd netbsd darwin cygwin zos + EUC-TW glibc aix hpux irix osf solaris netbsd + BIG5 Y glibc aix hpux osf solaris freebsd netbsd darwin cygwin zos + BIG5-HKSCS glibc hpux solaris netbsd darwin + GBK glibc aix osf solaris freebsd darwin cygwin windows dos + GB18030 glibc hpux solaris freebsd netbsd darwin + SHIFT_JIS Y hpux osf solaris freebsd netbsd darwin + JOHAB glibc solaris windows + TIS-620 glibc aix hpux osf solaris cygwin zos + VISCII Y glibc + TCVN5712-1 glibc + ARMSCII-8 glibc freebsd netbsd darwin + GEORGIAN-PS glibc cygwin + PT154 glibc netbsd cygwin + HP-ROMAN8 hpux + HP-ARABIC8 hpux + HP-GREEK8 hpux + HP-HEBREW8 hpux + HP-TURKISH8 hpux + HP-KANA8 hpux + DEC-KANJI osf + DEC-HANYU osf + UTF-8 Y glibc aix hpux osf solaris netbsd darwin cygwin zos + + Note: Names which are not marked as being a MIME name should not be used in + Internet protocols for information interchange (mail, news, etc.). + + Note: ASCII and ANSI_X3.4-1968 are synonymous canonical names. Applications + must understand both names and treat them as equivalent. + */ + + +#ifdef __cplusplus +} +#endif + + +#endif /* _LOCALCHARSET_H */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h new file mode 100644 index 0000000000000000000000000000000000000000..ede3c82e3eb129528194a2045c808b418fb20296 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h @@ -0,0 +1,543 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + * Even better than compiling with -DZ_PREFIX would be to use configure to set + * this permanently in zconf.h using "./configure --zprefix". + */ +#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ +# define Z_PREFIX_SET + +/* all linked symbols and init macros */ +# define _dist_code z__dist_code +# define _length_code z__length_code +# define _tr_align z__tr_align +# define _tr_flush_bits z__tr_flush_bits +# define _tr_flush_block z__tr_flush_block +# define _tr_init z__tr_init +# define _tr_stored_block z__tr_stored_block +# define _tr_tally z__tr_tally +# define adler32 z_adler32 +# define adler32_combine z_adler32_combine +# define adler32_combine64 z_adler32_combine64 +# define adler32_z z_adler32_z +# ifndef Z_SOLO +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# endif +# define crc32 z_crc32 +# define crc32_combine z_crc32_combine +# define crc32_combine64 z_crc32_combine64 +# define crc32_combine_gen z_crc32_combine_gen +# define crc32_combine_gen64 z_crc32_combine_gen64 +# define crc32_combine_op z_crc32_combine_op +# define crc32_z z_crc32_z +# define deflate z_deflate +# define deflateBound z_deflateBound +# define deflateCopy z_deflateCopy +# define deflateEnd z_deflateEnd +# define deflateGetDictionary z_deflateGetDictionary +# define deflateInit z_deflateInit +# define deflateInit2 z_deflateInit2 +# define deflateInit2_ z_deflateInit2_ +# define deflateInit_ z_deflateInit_ +# define deflateParams z_deflateParams +# define deflatePending z_deflatePending +# define deflatePrime z_deflatePrime +# define deflateReset z_deflateReset +# define deflateResetKeep z_deflateResetKeep +# define deflateSetDictionary z_deflateSetDictionary +# define deflateSetHeader z_deflateSetHeader +# define deflateTune z_deflateTune +# define deflate_copyright z_deflate_copyright +# define get_crc_table z_get_crc_table +# ifndef Z_SOLO +# define gz_error z_gz_error +# define gz_intmax z_gz_intmax +# define gz_strwinerror z_gz_strwinerror +# define gzbuffer z_gzbuffer +# define gzclearerr z_gzclearerr +# define gzclose z_gzclose +# define gzclose_r z_gzclose_r +# define gzclose_w z_gzclose_w +# define gzdirect z_gzdirect +# define gzdopen z_gzdopen +# define gzeof z_gzeof +# define gzerror z_gzerror +# define gzflush z_gzflush +# define gzfread z_gzfread +# define gzfwrite z_gzfwrite +# define gzgetc z_gzgetc +# define gzgetc_ z_gzgetc_ +# define gzgets z_gzgets +# define gzoffset z_gzoffset +# define gzoffset64 z_gzoffset64 +# define gzopen z_gzopen +# define gzopen64 z_gzopen64 +# ifdef _WIN32 +# define gzopen_w z_gzopen_w +# endif +# define gzprintf z_gzprintf +# define gzputc z_gzputc +# define gzputs z_gzputs +# define gzread z_gzread +# define gzrewind z_gzrewind +# define gzseek z_gzseek +# define gzseek64 z_gzseek64 +# define gzsetparams z_gzsetparams +# define gztell z_gztell +# define gztell64 z_gztell64 +# define gzungetc z_gzungetc +# define gzvprintf z_gzvprintf +# define gzwrite z_gzwrite +# endif +# define inflate z_inflate +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define inflateBackInit z_inflateBackInit +# define inflateBackInit_ z_inflateBackInit_ +# define inflateCodesUsed z_inflateCodesUsed +# define inflateCopy z_inflateCopy +# define inflateEnd z_inflateEnd +# define inflateGetDictionary z_inflateGetDictionary +# define inflateGetHeader z_inflateGetHeader +# define inflateInit z_inflateInit +# define inflateInit2 z_inflateInit2 +# define inflateInit2_ z_inflateInit2_ +# define inflateInit_ z_inflateInit_ +# define inflateMark z_inflateMark +# define inflatePrime z_inflatePrime +# define inflateReset z_inflateReset +# define inflateReset2 z_inflateReset2 +# define inflateResetKeep z_inflateResetKeep +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateUndermine z_inflateUndermine +# define inflateValidate z_inflateValidate +# define inflate_copyright z_inflate_copyright +# define inflate_fast z_inflate_fast +# define inflate_table z_inflate_table +# ifndef Z_SOLO +# define uncompress z_uncompress +# define uncompress2 z_uncompress2 +# endif +# define zError z_zError +# ifndef Z_SOLO +# define zcalloc z_zcalloc +# define zcfree z_zcfree +# endif +# define zlibCompileFlags z_zlibCompileFlags +# define zlibVersion z_zlibVersion + +/* all zlib typedefs in zlib.h and zconf.h */ +# define Byte z_Byte +# define Bytef z_Bytef +# define alloc_func z_alloc_func +# define charf z_charf +# define free_func z_free_func +# ifndef Z_SOLO +# define gzFile z_gzFile +# endif +# define gz_header z_gz_header +# define gz_headerp z_gz_headerp +# define in_func z_in_func +# define intf z_intf +# define out_func z_out_func +# define uInt z_uInt +# define uIntf z_uIntf +# define uLong z_uLong +# define uLongf z_uLongf +# define voidp z_voidp +# define voidpc z_voidpc +# define voidpf z_voidpf + +/* all zlib structs in zlib.h and zconf.h */ +# define gz_header_s z_gz_header_s +# define internal_state z_internal_state + +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +#if defined(ZLIB_CONST) && !defined(z_const) +# define z_const const +#else +# define z_const +#endif + +#ifdef Z_SOLO +# ifdef _WIN64 + typedef unsigned long long z_size_t; +# else + typedef unsigned long z_size_t; +# endif +#else +# define z_longlong long long +# if defined(NO_SIZE_T) + typedef unsigned NO_SIZE_T z_size_t; +# elif defined(STDC) +# include + typedef size_t z_size_t; +# else + typedef unsigned long z_size_t; +# endif +# undef z_longlong +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus about 7 kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. + * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. */ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) +# include +# if (UINT_MAX == 0xffffffffUL) +# define Z_U4 unsigned +# elif (ULONG_MAX == 0xffffffffUL) +# define Z_U4 unsigned long +# elif (USHRT_MAX == 0xffffffffUL) +# define Z_U4 unsigned short +# endif +#endif + +#ifdef Z_U4 + typedef Z_U4 z_crc_t; +#else + typedef unsigned long z_crc_t; +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_UNISTD_H +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_STDARG_H +#endif + +#ifdef STDC +# ifndef Z_SOLO +# include /* for off_t */ +# endif +#endif + +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +# include /* for va_list */ +# endif +#endif + +#ifdef _WIN32 +# ifndef Z_SOLO +# include /* for wchar_t */ +# endif +#endif + +/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and + * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even + * though the former does not conform to the LFS document), but considering + * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as + * equivalently requesting no 64-bit operations + */ +#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 +# undef _LARGEFILE64_SOURCE +#endif + +#ifndef Z_HAVE_UNISTD_H +# ifdef __WATCOMC__ +# define Z_HAVE_UNISTD_H +# endif +#endif +#ifndef Z_HAVE_UNISTD_H +# if defined(_LARGEFILE64_SOURCE) && !defined(_WIN32) +# define Z_HAVE_UNISTD_H +# endif +#endif +#ifndef Z_SOLO +# if defined(Z_HAVE_UNISTD_H) +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef VMS +# include /* for off_t */ +# endif +# ifndef z_off_t +# define z_off_t off_t +# endif +# endif +#endif + +#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 +# define Z_LFS64 +#endif + +#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) +# define Z_LARGE64 +#endif + +#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) +# define Z_WANT64 +#endif + +#if !defined(SEEK_SET) && !defined(Z_SOLO) +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. */ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(_WIN32) && !defined(__GNUC__) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) + #pragma map(deflateInit_,"DEIN") + #pragma map(deflateInit2_,"DEIN2") + #pragma map(deflateEnd,"DEEND") + #pragma map(deflateBound,"DEBND") + #pragma map(inflateInit_,"ININ") + #pragma map(inflateInit2_,"ININ2") + #pragma map(inflateEnd,"INEND") + #pragma map(inflateSync,"INSY") + #pragma map(inflateSetDictionary,"INSEDI") + #pragma map(compressBound,"CMBND") + #pragma map(inflate_table,"INTABL") + #pragma map(inflate_fast,"INFA") + #pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h new file mode 100644 index 0000000000000000000000000000000000000000..8d4b932eaf6a0fbb8133b3ab49ba5ef587059fa0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h @@ -0,0 +1,1938 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.3.1, January 22nd, 2024 + + Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950 + (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.3.1" +#define ZLIB_VERNUM 0x1310 +#define ZLIB_VER_MAJOR 1 +#define ZLIB_VER_MINOR 3 +#define ZLIB_VER_REVISION 1 +#define ZLIB_VER_SUBREVISION 0 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed data. + This version of the library supports only one compression method (deflation) + but other algorithms will be added later and will have the same stream + interface. + + Compression can be done in a single step if the buffers are large enough, + or can be done by repeated calls of the compression function. In the latter + case, the application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip and raw deflate streams in + memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never crash + even in the case of corrupted input. +*/ + +typedef voidpf (*alloc_func)(voidpf opaque, uInt items, uInt size); +typedef void (*free_func)(voidpf opaque, voidpf address); + +struct internal_state; + +typedef struct z_stream_s { + z_const Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total number of input bytes read so far */ + + Bytef *next_out; /* next output byte will go here */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total number of bytes output so far */ + + z_const char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text + for deflate, or the decoding state for inflate */ + uLong adler; /* Adler-32 or CRC-32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. +*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has dropped + to zero. It must update next_out and avail_out when avail_out has dropped + to zero. The application must initialize zalloc, zfree and opaque before + calling the init function. All other fields are set by the compression + library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. In that case, zlib is thread-safe. When zalloc and zfree are + Z_NULL on entry to the initialization function, they are set to internal + routines that use the standard library functions malloc() and free(). + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this if + the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, pointers + returned by zalloc for objects of exactly 65536 bytes *must* have their + offset normalized to zero. The default allocation function provided by this + library ensures this (see zutil.c). To reduce memory requirements and avoid + any allocation of 64K objects, at the expense of compression ratio, compile + the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or progress + reports. After compression, total_in holds the total size of the + uncompressed data and may be saved for use by the decompressor (particularly + if the decompressor wants to decompress everything in a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +#define Z_TREES 6 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative values + * are errors, positive values are used for special but normal events. + */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field for deflate() */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion(void); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is not + compatible with the zlib.h header file used by the application. This check + is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit(z_streamp strm, int level); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. If + zalloc and zfree are set to Z_NULL, deflateInit updates them to use default + allocation functions. total_in, total_out, adler, and msg are initialized. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at all + (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION + requests a default compromise between speed and compression (currently + equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if level is not a valid compression level, or + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). msg is set to null + if there is no error message. deflateInit does not perform any compression: + this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate(z_streamp strm, int flush); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Generate more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary. Some output may be provided even if + flush is zero. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating avail_in or avail_out accordingly; avail_out should + never be zero before the call. The application can consume the compressed + output when it wants, for example when the output buffer is full (avail_out + == 0), or after each call of deflate(). If deflate returns Z_OK and with + zero avail_out, it must be called again after making room in the output + buffer because there might be more output pending. See deflatePending(), + which can be used if desired to determine whether or not there is more output + in that case. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumulate before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In + particular avail_in is zero after the call if enough output space has been + provided before the call.) Flushing may degrade compression for some + compression algorithms and so it should be used only when necessary. This + completes the current deflate block and follows it with an empty stored block + that is three bits plus filler bits to the next byte, followed by four bytes + (00 00 ff ff). + + If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the + output buffer, but the output is not aligned to a byte boundary. All of the + input data so far will be available to the decompressor, as for Z_SYNC_FLUSH. + This completes the current deflate block and follows it with an empty fixed + codes block that is 10 bits long. This assures that enough bytes are output + in order for the decompressor to finish the block before the empty fixed + codes block. + + If flush is set to Z_BLOCK, a deflate block is completed and emitted, as + for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to + seven bits of the current block are held to be written as the next byte after + the next deflate block is completed. In this case, the decompressor may not + be provided enough bits at this point in order to complete decompression of + the data provided so far to the compressor. It may need to wait for the next + block to be emitted. This is for advanced applications that need to control + the emission of deflate blocks. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six when the flush marker begins, in order to avoid + repeated flush markers upon calling deflate() again when avail_out == 0. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there was + enough output space. If deflate returns with Z_OK or Z_BUF_ERROR, this + function must be called again with Z_FINISH and more output space (updated + avail_out) but no more input data, until it returns with Z_STREAM_END or an + error. After deflate has returned Z_STREAM_END, the only possible operations + on the stream are deflateReset or deflateEnd. + + Z_FINISH can be used in the first deflate call after deflateInit if all the + compression is to be done in a single step. In order to complete in one + call, avail_out must be at least the value returned by deflateBound (see + below). Then deflate is guaranteed to return Z_STREAM_END. If not enough + output space is provided, deflate will not return Z_STREAM_END, and it must + be called again as described above. + + deflate() sets strm->adler to the Adler-32 checksum of all input read + so far (that is, total_in bytes). If a gzip stream is being generated, then + strm->adler will be the CRC-32 checksum of the input read so far. (See + deflateInit2 below.) + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). If in doubt, the data is + considered binary. This field is only for information purposes and does not + affect the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was Z_NULL or the state was inadvertently written over + by the application), or Z_BUF_ERROR if no progress is possible (for example + avail_in or avail_out was zero). Note that Z_BUF_ERROR is not fatal, and + deflate() can be called again with more input and more output space to + continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd(z_streamp strm); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, msg + may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit(z_streamp strm); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. In the current version of inflate, the provided input is not + read or consumed. The allocation of a sliding window will be deferred to + the first call of inflate (if the decompression does not complete on the + first call). If zalloc and zfree are set to Z_NULL, inflateInit updates + them to use default allocation functions. total_in, total_out, adler, and + msg are initialized. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. inflateInit does not perform any decompression. + Actual decompression will be done by inflate(). So next_in, and avail_in, + next_out, and avail_out are unused and unchanged. The current + implementation of inflateInit() does not process any header information -- + that is deferred until inflate() is called. +*/ + + +ZEXTERN int ZEXPORT inflate(z_streamp strm, int flush); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), then next_in and avail_in are updated + accordingly, and processing will resume at this point for the next call of + inflate(). + + - Generate more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there is + no more input data or no more space in the output buffer (see below about + the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating the next_* and avail_* values accordingly. If the + caller of inflate() does not provide both available input and available + output space, it is possible that there will be no progress made. The + application can consume the uncompressed output when it wants, for example + when the output buffer is full (avail_out == 0), or after each call of + inflate(). If inflate returns Z_OK and with zero avail_out, it must be + called again after making room in the output buffer because there might be + more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH, + Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() + stop if and when it gets to the next deflate block boundary. When decoding + the zlib or gzip format, this will cause inflate() to return immediately + after the header and before the first block. When doing a raw inflate, + inflate() will go ahead and process the first block, and will return when it + gets to the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + To assist in this, on return inflate() always sets strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 if + inflate() is currently decoding the last block in the deflate stream, plus + 128 if inflate() returned immediately after decoding an end-of-block code or + decoding the complete header up to just before the first byte of the deflate + stream. The end-of-block will not be indicated until all of the uncompressed + data from that block has been written to strm->next_out. The number of + unused bits may in general be greater than seven, except when bit 7 of + data_type is set, in which case the number of unused bits will be less than + eight. data_type is set as noted here every time inflate() returns for all + flush options, and so can be used to determine the amount of currently + consumed input in bits. + + The Z_TREES option behaves as Z_BLOCK does, but it also returns when the + end of each deflate block header is reached, before any actual data in that + block is decoded. This allows the caller to determine the length of the + deflate block header for later use in random access within a deflate block. + 256 is added to the value of strm->data_type when inflate() returns + immediately after reaching the end of the deflate block header. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step (a + single call of inflate), the parameter flush should be set to Z_FINISH. In + this case all pending input is processed and all pending output is flushed; + avail_out must be large enough to hold all of the uncompressed data for the + operation to complete. (The size of the uncompressed data may have been + saved by the compressor for this purpose.) The use of Z_FINISH is not + required to perform an inflation in one step. However it may be used to + inform inflate that a faster approach can be used for the single inflate() + call. Z_FINISH also informs inflate to not maintain a sliding window if the + stream completes, which reduces inflate's memory footprint. If the stream + does not complete, either because not all of the stream is provided or not + enough output space is provided, then a sliding window will be allocated and + inflate() can be called again to continue the operation as if Z_NO_FLUSH had + been used. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the effects of the flush parameter in this implementation are + on the return value of inflate() as noted below, when inflate() returns early + when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of + memory for a sliding window when Z_FINISH is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the Adler-32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the Adler-32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed Adler-32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() can decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically, if requested when + initializing with inflateInit2(). Any information contained in the gzip + header is not retained unless inflateGetHeader() is used. When processing + gzip-wrapped deflate data, strm->adler32 is set to the CRC-32 of the output + produced so far. The CRC-32 is checked against the gzip trailer, as is the + uncompressed length, modulo 2^32. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value, in which case strm->msg points to a string with a more specific + error), Z_STREAM_ERROR if the stream structure was inconsistent (for example + next_in or next_out was Z_NULL, or the state was inadvertently written over + by the application), Z_MEM_ERROR if there was not enough memory, Z_BUF_ERROR + if no progress was possible or if there was not enough room in the output + buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may + then call inflateSync() to look for a good compression block if a partial + recovery of the data is to be attempted. +*/ + + +ZEXTERN int ZEXPORT inflateEnd(z_streamp strm); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. + + inflateEnd returns Z_OK if success, or Z_STREAM_ERROR if the stream state + was inconsistent. +*/ + + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2(z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy); + + This is another version of deflateInit with more compression options. The + fields zalloc, zfree and opaque must be initialized before by the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + For the current implementation of deflate(), a windowBits value of 8 (a + window size of 256 bytes) is not supported. As a result, a request for 8 + will result in 9 (a 512-byte window). In that case, providing 8 to + inflateInit2() will result in an error when the zlib header with 9 is + checked against the initialization of inflate(). The remedy is to not use 8 + with deflateInit2() with this initialization, or at least in that case use 9 + with inflateInit2(). + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute a check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), no + header crc, and the operating system will be set to the appropriate value, + if the operating system was determined at compile time. If a gzip stream is + being written, strm->adler is a CRC-32 instead of an Adler-32. + + For raw deflate or gzip encoding, a request for a 256-byte window is + rejected as invalid, since only the zlib header provides a means of + transmitting the window size to the decompressor. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but is + slow and reduces compression ratio; memLevel=9 uses maximum memory for + optimal speed. The default value is 8. See zconf.h for total memory usage + as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as + fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The + strategy parameter only affects the compression ratio but not the + correctness of the compressed output even if it is not set appropriately. + Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler + decoder for special applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid + method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is + incompatible with the version assumed by the caller (ZLIB_VERSION). msg is + set to null if there is no error message. deflateInit2 does not perform any + compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary(z_streamp strm, + const Bytef *dictionary, + uInt dictLength); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. When using the zlib format, this + function must be called immediately after deflateInit, deflateInit2 or + deflateReset, and before any call of deflate. When doing raw deflate, this + function must be called either before any call of deflate, or immediately + after the completion of a deflate block, i.e. after all input has been + consumed and all output has been delivered when using any of the flush + options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The + compressor and decompressor must use exactly the same dictionary (see + inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. + + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size + provided in deflateInit or deflateInit2. Thus the strings most likely to be + useful should be put at the end of the dictionary, not at the front. In + addition, the current implementation of deflate will use at most the window + size minus 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the Adler-32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The Adler-32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + Adler-32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if not at a block boundary for raw deflate). deflateSetDictionary does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateGetDictionary(z_streamp strm, + Bytef *dictionary, + uInt *dictLength); +/* + Returns the sliding dictionary being maintained by deflate. dictLength is + set to the number of bytes in the dictionary, and that many bytes are copied + to dictionary. dictionary must have enough space, where 32768 bytes is + always enough. If deflateGetDictionary() is called with dictionary equal to + Z_NULL, then only the dictionary length is returned, and nothing is copied. + Similarly, if dictLength is Z_NULL, then it is not set. + + deflateGetDictionary() may return a length less than the window size, even + when more than the window size in input has been provided. It may return up + to 258 bytes less in that case, due to how zlib's implementation of deflate + manages the sliding window and lookahead for matches, where matches can be + up to 258 bytes long. If the application needs the last window-size bytes of + input, then that would need to be saved by the application outside of zlib. + + deflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the + stream state is inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateCopy(z_streamp dest, + z_streamp source); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and can + consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset(z_streamp strm); +/* + This function is equivalent to deflateEnd followed by deflateInit, but + does not free and reallocate the internal compression state. The stream + will leave the compression level and any other attributes that may have been + set unchanged. total_in, total_out, adler, and msg are initialized. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams(z_streamp strm, + int level, + int strategy); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2(). This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different strategy. + If the compression approach (which is a function of the level) or the + strategy is changed, and if there have been any deflate() calls since the + state was initialized or reset, then the input available so far is + compressed with the old level and strategy using deflate(strm, Z_BLOCK). + There are three approaches for the compression levels 0, 1..3, and 4..9 + respectively. The new level and strategy will take effect at the next call + of deflate(). + + If a deflate(strm, Z_BLOCK) is performed by deflateParams(), and it does + not have enough output space to complete, then the parameter change will not + take effect. In this case, deflateParams() can be called again with the + same parameters and more output space to try again. + + In order to assure a change in the parameters on the first try, the + deflate stream should be flushed using deflate() with Z_BLOCK or other flush + request until strm.avail_out is not zero, before calling deflateParams(). + Then no more input data should be provided before the deflateParams() call. + If this is done, the old level and strategy will be applied to the data + compressed before deflateParams(), and the new level and strategy will be + applied to the data compressed after deflateParams(). + + deflateParams returns Z_OK on success, Z_STREAM_ERROR if the source stream + state was inconsistent or if a parameter was invalid, or Z_BUF_ERROR if + there was not enough output space to complete the compression of the + available input data before a change in the strategy or approach. Note that + in the case of a Z_BUF_ERROR, the parameters are not changed. A return + value of Z_BUF_ERROR is not fatal, in which case deflateParams() can be + retried with more output space. +*/ + +ZEXTERN int ZEXPORT deflateTune(z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain); +/* + Fine tune deflate's internal compression parameters. This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound(z_streamp strm, + uLong sourceLen); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() or + deflateInit2(), and after deflateSetHeader(), if used. This would be used + to allocate an output buffer for deflation in a single pass, and so would be + called before deflate(). If that first deflate() call is provided the + sourceLen input bytes, an output buffer allocated to the size returned by + deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed + to return Z_STREAM_END. Note that it is possible for the compressed size to + be larger than the value returned by deflateBound() if flush options other + than Z_FINISH or Z_NO_FLUSH are used. +*/ + +ZEXTERN int ZEXPORT deflatePending(z_streamp strm, + unsigned *pending, + int *bits); +/* + deflatePending() returns the number of bytes and bits of output that have + been generated, but not yet provided in the available output. The bytes not + provided would be due to the available output space having being consumed. + The number of bits of output not provided are between 0 and 7, where they + await more bits to join them in order to fill out a full byte. If pending + or bits are Z_NULL, then those values are not set. + + deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. + */ + +ZEXTERN int ZEXPORT deflatePrime(z_streamp strm, + int bits, + int value); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the bits + leftover from a previous deflate stream when appending to it. As such, this + function can only be used for raw deflate, and must be used before the first + deflate() call after a deflateInit2() or deflateReset(). bits must be less + than or equal to 16, and that many of the least significant bits of value + will be inserted in the output. + + deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough + room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the + source stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader(z_streamp strm, + gz_headerp head); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to the current operating system, with no + extra, name, or comment fields. The gzip header is returned to the default + state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2(z_streamp strm, + int windowBits); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be zero to request that inflate use the window size in + the zlib header of the compressed stream. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an Adler-32 or a CRC-32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a + CRC-32 instead of an Adler-32. Unlike the gunzip utility and gzread() (see + below), inflate() will *not* automatically decode concatenated gzip members. + inflate() will return Z_STREAM_END at the end of the gzip member. The state + would need to be reset to continue decoding a subsequent gzip member. This + *must* be done if there is more data after a gzip member, in order for the + decompression to be compliant with the gzip standard (RFC 1952). + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. inflateInit2 does not perform any decompression + apart from possibly reading the zlib header if present: actual decompression + will be done by inflate(). (So next_in and avail_in may be modified, but + next_out and avail_out are unused and unchanged.) The current implementation + of inflateInit2() does not process any header information -- that is + deferred until inflate() is called. +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary(z_streamp strm, + const Bytef *dictionary, + uInt dictLength); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the Adler-32 value returned by that call of inflate. + The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called at any + time to set the dictionary. If the provided dictionary is smaller than the + window and there is already data in the window, then the provided dictionary + will amend what's there. The application must insure that the dictionary + that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect Adler-32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateGetDictionary(z_streamp strm, + Bytef *dictionary, + uInt *dictLength); +/* + Returns the sliding dictionary being maintained by inflate. dictLength is + set to the number of bytes in the dictionary, and that many bytes are copied + to dictionary. dictionary must have enough space, where 32768 bytes is + always enough. If inflateGetDictionary() is called with dictionary equal to + Z_NULL, then only the dictionary length is returned, and nothing is copied. + Similarly, if dictLength is Z_NULL, then it is not set. + + inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the + stream state is inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateSync(z_streamp strm); +/* + Skips invalid compressed data until a possible full flush point (see above + for the description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync searches for a 00 00 FF FF pattern in the compressed data. + All full flush points have this pattern, but not all occurrences of this + pattern are full flush points. + + inflateSync returns Z_OK if a possible full flush point has been found, + Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point + has been found, or Z_STREAM_ERROR if the stream structure was inconsistent. + In the success case, the application may save the current value of total_in + which indicates where valid compressed data was found. In the error case, + the application may repeatedly call inflateSync, providing more input each + time, until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy(z_streamp dest, + z_streamp source); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream. + + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset(z_streamp strm); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate the internal decompression state. The + stream will keep attributes that may have been set by inflateInit2. + total_in, total_out, adler, and msg are initialized. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT inflateReset2(z_streamp strm, + int windowBits); +/* + This function is the same as inflateReset, but it also permits changing + the wrap and window size requests. The windowBits parameter is interpreted + the same as it is for inflateInit2. If the window size is changed, then the + memory allocated for the window is freed, and the window will be reallocated + by inflate() if needed. + + inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL), or if + the windowBits parameter is invalid. +*/ + +ZEXTERN int ZEXPORT inflatePrime(z_streamp strm, + int bits, + int value); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + If bits is negative, then the input stream bit buffer is emptied. Then + inflatePrime() can be called again to put bits in the buffer. This is used + to clear out bits leftover after feeding inflate a block description prior + to feeding inflate codes. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN long ZEXPORT inflateMark(z_streamp strm); +/* + This function returns two values, one in the lower 16 bits of the return + value, and the other in the remaining upper bits, obtained by shifting the + return value down 16 bits. If the upper value is -1 and the lower value is + zero, then inflate() is currently decoding information outside of a block. + If the upper value is -1 and the lower value is non-zero, then inflate is in + the middle of a stored block, with the lower value equaling the number of + bytes from the input remaining to copy. If the upper value is not -1, then + it is the number of bits back from the current bit position in the input of + the code (literal or length/distance pair) currently being processed. In + that case the lower value is the number of bytes already emitted for that + code. + + A code is being processed if inflate is waiting for more input to complete + decoding of the code, or if it has completed decoding but is waiting for + more output space to write the literal or match data. + + inflateMark() is used to mark locations in the input data for random + access, which may be at bit positions, and to note those cases where the + output of a code may span boundaries of random access blocks. The current + location in the input stream can be determined from avail_in and data_type + as noted in the description for the Z_BLOCK flush parameter for inflate. + + inflateMark returns the value noted above, or -65536 if the provided + source stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader(z_streamp strm, + gz_headerp head); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be + used to force inflate() to return immediately after header processing is + complete and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When any + of extra, name, or comment are not Z_NULL and the respective field is not + present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit(z_streamp strm, int windowBits, + unsigned char FAR *window); + + Initialize the internal stream state for decompression using inflateBack() + calls. The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the parameters are invalid, Z_MEM_ERROR if the internal state could not be + allocated, or Z_VERSION_ERROR if the version of the library does not match + the version of the header file. +*/ + +typedef unsigned (*in_func)(void FAR *, + z_const unsigned char FAR * FAR *); +typedef int (*out_func)(void FAR *, unsigned char FAR *, unsigned); + +ZEXTERN int ZEXPORT inflateBack(z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is potentially more efficient than + inflate() for file i/o applications, in that it avoids copying between the + output and the sliding window by simply making the window itself the output + buffer. inflate() can be faster on modern CPUs when used with large + buffers. inflateBack() trusts the application to not change the output + buffer passed by the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free the + allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer. + This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects only + the raw deflate stream to decompress. This is different from the default + behavior of inflate(), which expects a zlib header and trailer around the + deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero -- buf is ignored in that + case -- and inflateBack() will return a buffer error. inflateBack() will + call out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. + out() should return zero on success, or non-zero on failure. If out() + returns non-zero, inflateBack() will return with an error. Neither in() nor + out() are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() is passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format error + in the deflate stream (in which case strm->msg is set to indicate the nature + of the error), or Z_STREAM_ERROR if the stream was not properly initialized. + In the case of Z_BUF_ERROR, an input or output error can be distinguished + using strm->next_in which will be Z_NULL only if in() returned an error. If + strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning + non-zero. (in() will always be called before out(), so strm->next_in is + assured to be defined if out() returns non-zero.) Note that inflateBack() + cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd(z_streamp strm); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags(void); +/* Return flags indicating compile-time options. + + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: ZLIB_DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + +#ifndef Z_SOLO + + /* utility functions */ + +/* + The following utility functions are implemented on top of the basic + stream-oriented functions. To simplify the interface, some default options + are assumed (compression level and memory usage, standard memory allocation + functions). The source code of these utility functions can be modified if + you need special options. +*/ + +ZEXTERN int ZEXPORT compress(Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen); +/* + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total size + of the destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed data. compress() is equivalent to compress2() with a level + parameter of Z_DEFAULT_COMPRESSION. + + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2(Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed data. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound(uLong sourceLen); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before a + compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress(Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total size + of the destination buffer, which must be large enough to hold the entire + uncompressed data. (The size of the uncompressed data must have been saved + previously by the compressor and transmitted to the decompressor by some + mechanism outside the scope of this compression library.) Upon exit, destLen + is the actual size of the uncompressed data. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In + the case where there is not enough room, uncompress() will fill the output + buffer with the uncompressed data up to that point. +*/ + +ZEXTERN int ZEXPORT uncompress2(Bytef *dest, uLongf *destLen, + const Bytef *source, uLong *sourceLen); +/* + Same as uncompress, except that sourceLen is a pointer, where the + length of the source is *sourceLen. On return, *sourceLen is the number of + source bytes consumed. +*/ + + /* gzip file access functions */ + +/* + This library supports reading and writing files in gzip (.gz) format with + an interface similar to that of stdio, using the functions that start with + "gz". The gzip format is different from the zlib format. gzip is a gzip + wrapper, documented in RFC 1952, wrapped around a deflate stream. +*/ + +typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */ + +/* +ZEXTERN gzFile ZEXPORT gzopen(const char *path, const char *mode); + + Open the gzip (.gz) file at path for reading and decompressing, or + compressing and writing. The mode parameter is as in fopen ("rb" or "wb") + but can also include a compression level ("wb9") or a strategy: 'f' for + filtered data as in "wb6f", 'h' for Huffman-only compression as in "wb1h", + 'R' for run-length encoding as in "wb1R", or 'F' for fixed code compression + as in "wb9F". (See the description of deflateInit2 for more information + about the strategy parameter.) 'T' will request transparent writing or + appending with no compression and not using the gzip format. + + "a" can be used instead of "w" to request that the gzip stream that will + be written be appended to the file. "+" will result in an error, since + reading and writing to the same gzip file is not supported. The addition of + "x" when writing will create the file exclusively, which fails if the file + already exists. On systems that support it, the addition of "e" when + reading or writing will set the flag to close the file on an execve() call. + + These functions, as well as gzip, will read and decode a sequence of gzip + streams in a file. The append function of gzopen() can be used to create + such a file. (Also see gzflush() for another way to do this.) When + appending, gzopen does not test whether the file begins with a gzip stream, + nor does it look for the end of the gzip streams to begin appending. gzopen + will simply append a gzip stream to the existing file. + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. When + reading, this will be detected automatically by looking for the magic two- + byte gzip header. + + gzopen returns NULL if the file could not be opened, if there was + insufficient memory to allocate the gzFile state, or if an invalid mode was + specified (an 'r', 'w', or 'a' was not provided, or '+' was provided). + errno can be checked to determine if the reason gzopen failed was that the + file could not be opened. +*/ + +ZEXTERN gzFile ZEXPORT gzdopen(int fd, const char *mode); +/* + Associate a gzFile with the file descriptor fd. File descriptors are + obtained from calls like open, dup, creat, pipe or fileno (if the file has + been previously opened with fopen). The mode parameter is as in gzopen. + + The next call of gzclose on the returned gzFile will also close the file + descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor + fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd, + mode);. The duplicated descriptor should be saved to avoid a leak, since + gzdopen does not close fd if it fails. If you are using fileno() to get the + file descriptor from a FILE *, then you will have to use dup() to avoid + double-close()ing the file descriptor. Both gzclose() and fclose() will + close the associated file descriptor, so they need to have different file + descriptors. + + gzdopen returns NULL if there was insufficient memory to allocate the + gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not + provided, or '+' was provided), or if fd is -1. The file descriptor is not + used until the next gz* read, write, seek, or close operation, so gzdopen + will not detect if fd is invalid (unless fd is -1). +*/ + +ZEXTERN int ZEXPORT gzbuffer(gzFile file, unsigned size); +/* + Set the internal buffer size used by this library's functions for file to + size. The default buffer size is 8192 bytes. This function must be called + after gzopen() or gzdopen(), and before any other calls that read or write + the file. The buffer memory allocation is always deferred to the first read + or write. Three times that size in buffer space is allocated. A larger + buffer size of, for example, 64K or 128K bytes will noticeably increase the + speed of decompression (reading). + + The new buffer size also affects the maximum length for gzprintf(). + + gzbuffer() returns 0 on success, or -1 on failure, such as being called + too late. +*/ + +ZEXTERN int ZEXPORT gzsetparams(gzFile file, int level, int strategy); +/* + Dynamically update the compression level and strategy for file. See the + description of deflateInit2 for the meaning of these parameters. Previously + provided data is flushed before applying the parameter changes. + + gzsetparams returns Z_OK if success, Z_STREAM_ERROR if the file was not + opened for writing, Z_ERRNO if there is an error writing the flushed data, + or Z_MEM_ERROR if there is a memory allocation error. +*/ + +ZEXTERN int ZEXPORT gzread(gzFile file, voidp buf, unsigned len); +/* + Read and decompress up to len uncompressed bytes from file into buf. If + the input file is not in gzip format, gzread copies the given number of + bytes into the buffer directly from the file. + + After reaching the end of a gzip stream in the input, gzread will continue + to read, looking for another gzip stream. Any number of gzip streams may be + concatenated in the input file, and will all be decompressed by gzread(). + If something other than a gzip stream is encountered after a gzip stream, + that remaining trailing garbage is ignored (and no error is returned). + + gzread can be used to read a gzip file that is being concurrently written. + Upon reaching the end of the input, gzread will return with the available + data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then + gzclearerr can be used to clear the end of file indicator in order to permit + gzread to be tried again. Z_OK indicates that a gzip stream was completed + on the last gzread. Z_BUF_ERROR indicates that the input file ended in the + middle of a gzip stream. Note that gzread does not return -1 in the event + of an incomplete gzip stream. This error is deferred until gzclose(), which + will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip + stream. Alternatively, gzerror can be used before gzclose to detect this + case. + + gzread returns the number of uncompressed bytes actually read, less than + len for end of file, or -1 for error. If len is too large to fit in an int, + then nothing is read, -1 is returned, and the error state is set to + Z_STREAM_ERROR. +*/ + +ZEXTERN z_size_t ZEXPORT gzfread(voidp buf, z_size_t size, z_size_t nitems, + gzFile file); +/* + Read and decompress up to nitems items of size size from file into buf, + otherwise operating as gzread() does. This duplicates the interface of + stdio's fread(), with size_t request and return types. If the library + defines size_t, then z_size_t is identical to size_t. If not, then z_size_t + is an unsigned integer type that can contain a pointer. + + gzfread() returns the number of full items read of size size, or zero if + the end of the file was reached and a full item could not be read, or if + there was an error. gzerror() must be consulted if zero is returned in + order to determine if there was an error. If the multiplication of size and + nitems overflows, i.e. the product does not fit in a z_size_t, then nothing + is read, zero is returned, and the error state is set to Z_STREAM_ERROR. + + In the event that the end of file is reached and only a partial item is + available at the end, i.e. the remaining uncompressed data length is not a + multiple of size, then the final partial item is nevertheless read into buf + and the end-of-file flag is set. The length of the partial item read is not + provided, but could be inferred from the result of gztell(). This behavior + is the same as the behavior of fread() implementations in common libraries, + but it prevents the direct use of gzfread() to read a concurrently written + file, resetting and retrying on end-of-file, when size is not 1. +*/ + +ZEXTERN int ZEXPORT gzwrite(gzFile file, voidpc buf, unsigned len); +/* + Compress and write the len uncompressed bytes at buf to file. gzwrite + returns the number of uncompressed bytes written or 0 in case of error. +*/ + +ZEXTERN z_size_t ZEXPORT gzfwrite(voidpc buf, z_size_t size, + z_size_t nitems, gzFile file); +/* + Compress and write nitems items of size size from buf to file, duplicating + the interface of stdio's fwrite(), with size_t request and return types. If + the library defines size_t, then z_size_t is identical to size_t. If not, + then z_size_t is an unsigned integer type that can contain a pointer. + + gzfwrite() returns the number of full items written of size size, or zero + if there was an error. If the multiplication of size and nitems overflows, + i.e. the product does not fit in a z_size_t, then nothing is written, zero + is returned, and the error state is set to Z_STREAM_ERROR. +*/ + +ZEXTERN int ZEXPORTVA gzprintf(gzFile file, const char *format, ...); +/* + Convert, format, compress, and write the arguments (...) to file under + control of the string format, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written, or a negative zlib error code in case + of error. The number of uncompressed bytes written is limited to 8191, or + one less than the buffer size given to gzbuffer(). The caller should assure + that this limit is not exceeded. If it is exceeded, then gzprintf() will + return an error (0) with nothing written. In this case, there may also be a + buffer overflow with unpredictable consequences, which is possible only if + zlib was compiled with the insecure functions sprintf() or vsprintf(), + because the secure snprintf() or vsnprintf() functions were not available. + This can be determined using zlibCompileFlags(). +*/ + +ZEXTERN int ZEXPORT gzputs(gzFile file, const char *s); +/* + Compress and write the given null-terminated string s to file, excluding + the terminating null character. + + gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets(gzFile file, char *buf, int len); +/* + Read and decompress bytes from file into buf, until len-1 characters are + read, or until a newline character is read and transferred to buf, or an + end-of-file condition is encountered. If any characters are read or if len + is one, the string is terminated with a null character. If no characters + are read due to an end-of-file or len is less than one, then the buffer is + left untouched. + + gzgets returns buf which is a null-terminated string, or it returns NULL + for end-of-file or in case of error. If there was an error, the contents at + buf are indeterminate. +*/ + +ZEXTERN int ZEXPORT gzputc(gzFile file, int c); +/* + Compress and write c, converted to an unsigned char, into file. gzputc + returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc(gzFile file); +/* + Read and decompress one byte from file. gzgetc returns this byte or -1 + in case of end of file or error. This is implemented as a macro for speed. + As such, it does not do all of the checking the other functions do. I.e. + it does not check to see if file is NULL, nor whether the structure file + points to has been clobbered or not. +*/ + +ZEXTERN int ZEXPORT gzungetc(int c, gzFile file); +/* + Push c back onto the stream for file to be read as the first character on + the next read. At least one character of push-back is always allowed. + gzungetc() returns the character pushed, or -1 on failure. gzungetc() will + fail if c is -1, and may fail if a character has been pushed but not read + yet. If gzungetc is used immediately after gzopen or gzdopen, at least the + output buffer size of pushed characters is allowed. (See gzbuffer above.) + The pushed character will be discarded if the stream is repositioned with + gzseek() or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush(gzFile file, int flush); +/* + Flush all pending output to file. The parameter flush is as in the + deflate() function. The return value is the zlib error number (see function + gzerror below). gzflush is only permitted when writing. + + If the flush parameter is Z_FINISH, the remaining data is written and the + gzip stream is completed in the output. If gzwrite() is called again, a new + gzip stream will be started in the output. gzread() is able to read such + concatenated gzip streams. + + gzflush should be called only when strictly necessary because it will + degrade compression if called too often. +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzseek(gzFile file, + z_off_t offset, int whence); + + Set the starting position to offset relative to whence for the next gzread + or gzwrite on file. The offset represents a number of bytes in the + uncompressed data stream. The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind(gzFile file); +/* + Rewind file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET). +*/ + +/* +ZEXTERN z_off_t ZEXPORT gztell(gzFile file); + + Return the starting position for the next gzread or gzwrite on file. + This position represents a number of bytes in the uncompressed data stream, + and is zero when starting, even if appending or reading a gzip stream from + the middle of a file using gzdopen(). + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzoffset(gzFile file); + + Return the current compressed (actual) read or write offset of file. This + offset includes the count of bytes that precede the gzip stream, for example + when appending or when using gzdopen() for reading. When reading, the + offset does not include as yet unused buffered input. This information can + be used for a progress indicator. On error, gzoffset() returns -1. +*/ + +ZEXTERN int ZEXPORT gzeof(gzFile file); +/* + Return true (1) if the end-of-file indicator for file has been set while + reading, false (0) otherwise. Note that the end-of-file indicator is set + only if the read tried to go past the end of the input, but came up short. + Therefore, just like feof(), gzeof() may return false even if there is no + more data to read, in the event that the last read request was for the exact + number of bytes remaining in the input file. This will happen if the input + file size is an exact multiple of the buffer size. + + If gzeof() returns true, then the read functions will return no more data, + unless the end-of-file indicator is reset by gzclearerr() and the input file + has grown since the previous end of file was detected. +*/ + +ZEXTERN int ZEXPORT gzdirect(gzFile file); +/* + Return true (1) if file is being copied directly while reading, or false + (0) if file is a gzip stream being decompressed. + + If the input file is empty, gzdirect() will return true, since the input + does not contain a gzip stream. + + If gzdirect() is used immediately after gzopen() or gzdopen() it will + cause buffers to be allocated to allow reading the file to determine if it + is a gzip file. Therefore if gzbuffer() is used, it should be called before + gzdirect(). + + When writing, gzdirect() returns true (1) if transparent writing was + requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note: + gzdirect() is not needed when writing. Transparent writing must be + explicitly requested, so the application already knows the answer. When + linking statically, using gzdirect() will include all of the zlib code for + gzip file reading and decompression, which may not be desired.) +*/ + +ZEXTERN int ZEXPORT gzclose(gzFile file); +/* + Flush all pending output for file, if necessary, close file and + deallocate the (de)compression state. Note that once file is closed, you + cannot call gzerror with file, since its structures have been deallocated. + gzclose must not be called more than once on the same file, just as free + must not be called more than once on the same allocation. + + gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a + file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the + last read ended in the middle of a gzip stream, or Z_OK on success. +*/ + +ZEXTERN int ZEXPORT gzclose_r(gzFile file); +ZEXTERN int ZEXPORT gzclose_w(gzFile file); +/* + Same as gzclose(), but gzclose_r() is only for use when reading, and + gzclose_w() is only for use when writing or appending. The advantage to + using these instead of gzclose() is that they avoid linking in zlib + compression or decompression code that is not used when only reading or only + writing respectively. If gzclose() is used, then both compression and + decompression code will be included the application when linking to a static + zlib library. +*/ + +ZEXTERN const char * ZEXPORT gzerror(gzFile file, int *errnum); +/* + Return the error message for the last error which occurred on file. + errnum is set to zlib error number. If an error occurred in the file system + and not in the compression library, errnum is set to Z_ERRNO and the + application may consult errno to get the exact error code. + + The application must not modify the returned string. Future calls to + this function may invalidate the previously returned string. If file is + closed, then the string previously returned by gzerror will no longer be + available. + + gzerror() should be used to distinguish errors from end-of-file for those + functions above that do not distinguish those cases in their return values. +*/ + +ZEXTERN void ZEXPORT gzclearerr(gzFile file); +/* + Clear the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + +#endif /* !Z_SOLO */ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the compression + library. +*/ + +ZEXTERN uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. An Adler-32 value is in the range of a 32-bit + unsigned integer. If buf is Z_NULL, this function returns the required + initial value for the checksum. + + An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed + much faster. + + Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +ZEXTERN uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, + z_size_t len); +/* + Same as adler32(), but with a size_t length. +*/ + +/* +ZEXTERN uLong ZEXPORT adler32_combine(uLong adler1, uLong adler2, + z_off_t len2); + + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note + that the z_off_t type (like off_t) is a signed integer. If len2 is + negative, the result has no meaning or utility. +*/ + +ZEXTERN uLong ZEXPORT crc32(uLong crc, const Bytef *buf, uInt len); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer. + If buf is Z_NULL, this function returns the required initial value for the + crc. Pre- and post-conditioning (one's complement) is performed within this + function so it shouldn't be done by the application. + + Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +ZEXTERN uLong ZEXPORT crc32_z(uLong crc, const Bytef *buf, + z_size_t len); +/* + Same as crc32(), but with a size_t length. +*/ + +/* +ZEXTERN uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2); + + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. len2 must be non-negative. +*/ + +/* +ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t len2); + + Return the operator corresponding to length len2, to be used with + crc32_combine_op(). len2 must be non-negative. +*/ + +ZEXTERN uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op); +/* + Give the same result as crc32_combine(), using op in place of len2. op is + is generated from len2 by crc32_combine_gen(). This will be faster than + crc32_combine() if the generated op is used more than once. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_(z_streamp strm, int level, + const char *version, int stream_size); +ZEXTERN int ZEXPORT inflateInit_(z_streamp strm, + const char *version, int stream_size); +ZEXTERN int ZEXPORT deflateInit2_(z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size); +ZEXTERN int ZEXPORT inflateInit2_(z_streamp strm, int windowBits, + const char *version, int stream_size); +ZEXTERN int ZEXPORT inflateBackInit_(z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size); +#ifdef Z_PREFIX_SET +# define z_deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream)) +# define z_inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream)) +# define z_deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, (int)sizeof(z_stream)) +# define z_inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, \ + (int)sizeof(z_stream)) +# define z_inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, (int)sizeof(z_stream)) +#else +# define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream)) +# define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream)) +# define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, (int)sizeof(z_stream)) +# define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, \ + (int)sizeof(z_stream)) +# define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, (int)sizeof(z_stream)) +#endif + +#ifndef Z_SOLO + +/* gzgetc() macro and its supporting function and exposed data structure. Note + * that the real internal state is much larger than the exposed structure. + * This abbreviated structure exposes just enough for the gzgetc() macro. The + * user should not mess with these exposed elements, since their names or + * behavior could change in the future, perhaps even capriciously. They can + * only be used by the gzgetc() macro. You have been warned. + */ +struct gzFile_s { + unsigned have; + unsigned char *next; + z_off64_t pos; +}; +ZEXTERN int ZEXPORT gzgetc_(gzFile file); /* backward compatibility */ +#ifdef Z_PREFIX_SET +# undef z_gzgetc +# define z_gzgetc(g) \ + ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g)) +#else +# define gzgetc(g) \ + ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g)) +#endif + +/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or + * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if + * both are true, the application gets the *64 functions, and the regular + * functions are changed to 64 bits) -- in case these are set on systems + * without large file support, _LFS64_LARGEFILE must also be true + */ +#ifdef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *); + ZEXTERN z_off64_t ZEXPORT gzseek64(gzFile, z_off64_t, int); + ZEXTERN z_off64_t ZEXPORT gztell64(gzFile); + ZEXTERN z_off64_t ZEXPORT gzoffset64(gzFile); + ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off64_t); + ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off64_t); + ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off64_t); +#endif + +#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64) +# ifdef Z_PREFIX_SET +# define z_gzopen z_gzopen64 +# define z_gzseek z_gzseek64 +# define z_gztell z_gztell64 +# define z_gzoffset z_gzoffset64 +# define z_adler32_combine z_adler32_combine64 +# define z_crc32_combine z_crc32_combine64 +# define z_crc32_combine_gen z_crc32_combine_gen64 +# else +# define gzopen gzopen64 +# define gzseek gzseek64 +# define gztell gztell64 +# define gzoffset gzoffset64 +# define adler32_combine adler32_combine64 +# define crc32_combine crc32_combine64 +# define crc32_combine_gen crc32_combine_gen64 +# endif +# ifndef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *); + ZEXTERN z_off_t ZEXPORT gzseek64(gzFile, z_off_t, int); + ZEXTERN z_off_t ZEXPORT gztell64(gzFile); + ZEXTERN z_off_t ZEXPORT gzoffset64(gzFile); + ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off_t); +# endif +#else + ZEXTERN gzFile ZEXPORT gzopen(const char *, const char *); + ZEXTERN z_off_t ZEXPORT gzseek(gzFile, z_off_t, int); + ZEXTERN z_off_t ZEXPORT gztell(gzFile); + ZEXTERN z_off_t ZEXPORT gzoffset(gzFile); + ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t); +#endif + +#else /* Z_SOLO */ + + ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t); + ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t); + +#endif /* !Z_SOLO */ + +/* undocumented functions */ +ZEXTERN const char * ZEXPORT zError(int); +ZEXTERN int ZEXPORT inflateSyncPoint(z_streamp); +ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table(void); +ZEXTERN int ZEXPORT inflateUndermine(z_streamp, int); +ZEXTERN int ZEXPORT inflateValidate(z_streamp, int); +ZEXTERN unsigned long ZEXPORT inflateCodesUsed(z_streamp); +ZEXTERN int ZEXPORT inflateResetKeep(z_streamp); +ZEXTERN int ZEXPORT deflateResetKeep(z_streamp); +#if defined(_WIN32) && !defined(Z_SOLO) +ZEXTERN gzFile ZEXPORT gzopen_w(const wchar_t *path, + const char *mode); +#endif +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +ZEXTERN int ZEXPORTVA gzvprintf(gzFile file, + const char *format, + va_list va); +# endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* ZLIB_H */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd new file mode 100644 index 0000000000000000000000000000000000000000..31dcc406cdc3d006afe811e7e1f778b56407510f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd @@ -0,0 +1,56 @@ +from libc.string cimport const_char + +from lxml.includes.tree cimport xmlDoc +from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback +from lxml.includes.xmlparser cimport xmlParserCtxt, xmlSAXHandler, xmlSAXHandlerV1 + +cdef extern from "libxml/HTMLparser.h" nogil: + ctypedef enum htmlParserOption: + HTML_PARSE_NOERROR # suppress error reports + HTML_PARSE_NOWARNING # suppress warning reports + HTML_PARSE_PEDANTIC # pedantic error reporting + HTML_PARSE_NOBLANKS # remove blank nodes + HTML_PARSE_NONET # Forbid network access + # libxml2 2.6.21+ only: + HTML_PARSE_RECOVER # Relaxed parsing + HTML_PARSE_COMPACT # compact small text nodes + # libxml2 2.7.7+ only: + HTML_PARSE_NOIMPLIED # Do not add implied html/body... elements + # libxml2 2.7.8+ only: + HTML_PARSE_NODEFDTD # do not default a doctype if not found + # libxml2 2.8.0+ only: + XML_PARSE_IGNORE_ENC # ignore internal document encoding hint + + xmlSAXHandlerV1 htmlDefaultSAXHandler + + cdef xmlParserCtxt* htmlCreateMemoryParserCtxt( + char* buffer, int size) + cdef xmlParserCtxt* htmlCreateFileParserCtxt( + char* filename, char* encoding) + cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax, + void* user_data, + char* chunk, int size, + char* filename, int enc) + cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) + cdef void htmlCtxtReset(xmlParserCtxt* ctxt) + cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) + cdef int htmlParseDocument(xmlParserCtxt* ctxt) + cdef int htmlParseChunk(xmlParserCtxt* ctxt, + char* chunk, int size, int terminate) + + cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, + char* filename, const_char* encoding, + int options) + cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt, + char* buffer, char* URL, const_char* encoding, + int options) + cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt, + xmlInputReadCallback ioread, + xmlInputCloseCallback ioclose, + void* ioctx, + char* URL, const_char* encoding, + int options) + cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt, + char* buffer, int size, + char* filename, const_char* encoding, + int options) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baeab1cb4e5688692c8f4ff813cf7f10e7904e32 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea4697672b43dde3663b43831beaa7f91e75bebf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3e07a3aaf90df3db76528fed2a67ee4772e6eeb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h new file mode 100644 index 0000000000000000000000000000000000000000..8524592811aedebb82db9412b95c4bb50d918503 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h @@ -0,0 +1,73 @@ +/* + * Summary: Implementation of the XSLT number functions + * Description: Implementation of the XSLT number functions + * + * Copy: See Copyright for the status of this software. + * + * Author: Bjorn Reese and Daniel Veillard + */ + +#ifndef __XML_XSLT_NUMBERSINTERNALS_H__ +#define __XML_XSLT_NUMBERSINTERNALS_H__ + +#include +#include "xsltexports.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct _xsltCompMatch; + +/** + * xsltNumberData: + * + * This data structure is just a wrapper to pass xsl:number data in. + */ +typedef struct _xsltNumberData xsltNumberData; +typedef xsltNumberData *xsltNumberDataPtr; + +struct _xsltNumberData { + const xmlChar *level; + const xmlChar *count; + const xmlChar *from; + const xmlChar *value; + const xmlChar *format; + int has_format; + int digitsPerGroup; + int groupingCharacter; + int groupingCharacterLen; + xmlDocPtr doc; + xmlNodePtr node; + struct _xsltCompMatch *countPat; + struct _xsltCompMatch *fromPat; + + /* + * accelerators + */ +}; + +/** + * xsltFormatNumberInfo,: + * + * This data structure lists the various parameters needed to format numbers. + */ +typedef struct _xsltFormatNumberInfo xsltFormatNumberInfo; +typedef xsltFormatNumberInfo *xsltFormatNumberInfoPtr; + +struct _xsltFormatNumberInfo { + int integer_hash; /* Number of '#' in integer part */ + int integer_digits; /* Number of '0' in integer part */ + int frac_digits; /* Number of '0' in fractional part */ + int frac_hash; /* Number of '#' in fractional part */ + int group; /* Number of chars per display 'group' */ + int multiplier; /* Scaling for percent or permille */ + char add_decimal; /* Flag for whether decimal point appears in pattern */ + char is_multiplier_set; /* Flag to catch multiple occurences of percent/permille */ + char is_negative_pattern;/* Flag for processing -ve prefix/suffix */ +}; + +#ifdef __cplusplus +} +#endif +#endif /* __XML_XSLT_NUMBERSINTERNALS_H__ */ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h new file mode 100644 index 0000000000000000000000000000000000000000..2a2fc7e4305e567464874ba6cb7bcba69f85efdf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h @@ -0,0 +1,43 @@ +/* + * Summary: precomputing stylesheets + * Description: this is the compilation phase, where most of the + * stylesheet is "compiled" into faster to use data. + * + * Copy: See Copyright for the status of this software. + * + * Author: Daniel Veillard + */ + +#ifndef __XML_XSLT_PRECOMP_H__ +#define __XML_XSLT_PRECOMP_H__ + +#include +#include "xsltexports.h" +#include "xsltInternals.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Interfaces + */ +XSLTPUBVAR const xmlChar *xsltExtMarker; + +XSLTPUBFUN xsltElemPreCompPtr XSLTCALL + xsltDocumentComp (xsltStylesheetPtr style, + xmlNodePtr inst, + xsltTransformFunction function); + +XSLTPUBFUN void XSLTCALL + xsltStylePreCompute (xsltStylesheetPtr style, + xmlNodePtr inst); +XSLTPUBFUN void XSLTCALL + xsltFreeStylePreComps (xsltStylesheetPtr style); + +#ifdef __cplusplus +} +#endif + +#endif /* __XML_XSLT_PRECOMP_H__ */ + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h new file mode 100644 index 0000000000000000000000000000000000000000..95c352fee2a808c4e3f12e4273ab4622eeb249b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h @@ -0,0 +1,64 @@ +/* + * Summary: macros for marking symbols as exportable/importable. + * Description: macros for marking symbols as exportable/importable. + * + * Copy: See Copyright for the status of this software. + */ + +#ifndef __XSLT_EXPORTS_H__ +#define __XSLT_EXPORTS_H__ + +#if defined(_WIN32) || defined(__CYGWIN__) +/** DOC_DISABLE */ + +#ifdef LIBXSLT_STATIC + #define XSLTPUBLIC +#elif defined(IN_LIBXSLT) + #define XSLTPUBLIC __declspec(dllexport) +#else + #define XSLTPUBLIC __declspec(dllimport) +#endif + +#define XSLTCALL __cdecl + +/** DOC_ENABLE */ +#else /* not Windows */ + +/** + * XSLTPUBLIC: + * + * Macro which declares a public symbol + */ +#define XSLTPUBLIC + +/** + * XSLTCALL: + * + * Macro which declares the calling convention for exported functions + */ +#define XSLTCALL + +#endif /* platform switch */ + +/* + * XSLTPUBFUN: + * + * Macro which declares an exportable function + */ +#define XSLTPUBFUN XSLTPUBLIC + +/** + * XSLTPUBVAR: + * + * Macro which declares an exportable variable + */ +#define XSLTPUBVAR XSLTPUBLIC extern + +/* Compatibility */ +#if !defined(LIBXSLT_PUBLIC) +#define LIBXSLT_PUBLIC XSLTPUBVAR +#endif + +#endif /* __XSLT_EXPORTS_H__ */ + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h new file mode 100644 index 0000000000000000000000000000000000000000..7ad0146eb7d6c49c603081547a08fdad13f20af4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h @@ -0,0 +1,3 @@ +#ifndef LXML_VERSION_STRING +#define LXML_VERSION_STRING "6.0.2" +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..43a52e647be68a5aea00f7363df1df1de4f2a48c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd @@ -0,0 +1,492 @@ +from libc cimport stdio +from libc.string cimport const_char, const_uchar + +cdef extern from "lxml-version.h": + # deprecated declaration, use etreepublic.pxd instead + cdef char* LXML_VERSION_STRING + +cdef extern from "libxml/xmlversion.h": + const char* xmlParserVersion + int LIBXML_VERSION + + +cdef extern from "libxml/xmlstring.h" nogil: + ctypedef unsigned char xmlChar + ctypedef const xmlChar const_xmlChar "const xmlChar" + cdef int xmlStrlen(const_xmlChar* str) + cdef xmlChar* xmlStrdup(const_xmlChar* cur) + cdef int xmlStrncmp(const_xmlChar* str1, const_xmlChar* str2, int length) + cdef int xmlStrcmp(const_xmlChar* str1, const_xmlChar* str2) + cdef int xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) + cdef const_xmlChar* xmlStrstr(const_xmlChar* str1, const_xmlChar* str2) + cdef const_xmlChar* xmlStrchr(const_xmlChar* str1, xmlChar ch) + cdef const_xmlChar* _xcstr "(const xmlChar*)PyBytes_AS_STRING" (object s) + +cdef extern from "libxml/encoding.h" nogil: + ctypedef enum xmlCharEncoding: + XML_CHAR_ENCODING_ERROR = -1 # No char encoding detected + XML_CHAR_ENCODING_NONE = 0 # No char encoding detected + XML_CHAR_ENCODING_UTF8 = 1 # UTF-8 + XML_CHAR_ENCODING_UTF16LE = 2 # UTF-16 little endian + XML_CHAR_ENCODING_UTF16BE = 3 # UTF-16 big endian + XML_CHAR_ENCODING_UCS4LE = 4 # UCS-4 little endian + XML_CHAR_ENCODING_UCS4BE = 5 # UCS-4 big endian + XML_CHAR_ENCODING_EBCDIC = 6 # EBCDIC uh! + XML_CHAR_ENCODING_UCS4_2143 = 7 # UCS-4 unusual ordering + XML_CHAR_ENCODING_UCS4_3412 = 8 # UCS-4 unusual ordering + XML_CHAR_ENCODING_UCS2 = 9 # UCS-2 + XML_CHAR_ENCODING_8859_1 = 10 # ISO-8859-1 ISO Latin 1 + XML_CHAR_ENCODING_8859_2 = 11 # ISO-8859-2 ISO Latin 2 + XML_CHAR_ENCODING_8859_3 = 12 # ISO-8859-3 + XML_CHAR_ENCODING_8859_4 = 13 # ISO-8859-4 + XML_CHAR_ENCODING_8859_5 = 14 # ISO-8859-5 + XML_CHAR_ENCODING_8859_6 = 15 # ISO-8859-6 + XML_CHAR_ENCODING_8859_7 = 16 # ISO-8859-7 + XML_CHAR_ENCODING_8859_8 = 17 # ISO-8859-8 + XML_CHAR_ENCODING_8859_9 = 18 # ISO-8859-9 + XML_CHAR_ENCODING_2022_JP = 19 # ISO-2022-JP + XML_CHAR_ENCODING_SHIFT_JIS = 20 # Shift_JIS + XML_CHAR_ENCODING_EUC_JP = 21 # EUC-JP + XML_CHAR_ENCODING_ASCII = 22 # pure ASCII + + ctypedef struct xmlCharEncodingHandler: + char* name + + cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) + cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler( + xmlCharEncoding enc) + cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler) + cdef xmlCharEncoding xmlDetectCharEncoding(const_xmlChar* text, int len) + cdef const_char* xmlGetCharEncodingName(xmlCharEncoding enc) + cdef xmlCharEncoding xmlParseCharEncoding(char* name) + ctypedef int (*xmlCharEncodingOutputFunc)( + unsigned char *out_buf, int *outlen, const_uchar *in_buf, int *inlen) + +cdef extern from "libxml/chvalid.h" nogil: + cdef int xmlIsChar_ch(char c) + cdef int xmlIsCharQ(int ch) + +cdef extern from "libxml/hash.h": + ctypedef struct xmlHashTable + ctypedef void (*xmlHashScanner)(void* payload, void* data, const_xmlChar* name) noexcept # may require GIL! + void xmlHashScan(xmlHashTable* table, xmlHashScanner f, void* data) nogil + void* xmlHashLookup(xmlHashTable* table, const_xmlChar* name) nogil + ctypedef void (*xmlHashDeallocator)(void *payload, xmlChar *name) noexcept + cdef xmlHashTable* xmlHashCreate(int size) nogil + cdef xmlHashTable* xmlHashCreateDict(int size, xmlDict *dict) nogil + cdef int xmlHashSize(xmlHashTable* table) nogil + cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f) nogil + +cdef extern from * nogil: # actually "libxml/dict.h" + # libxml/dict.h appears to be broken to include in C + ctypedef struct xmlDict + cdef const_xmlChar* xmlDictLookup(xmlDict* dict, const_xmlChar* name, int len) + cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len) + cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name) + cdef size_t xmlDictSize(xmlDict* dict) + +cdef extern from "libxml/tree.h" nogil: + ctypedef struct xmlDoc + ctypedef struct xmlAttr + ctypedef struct xmlNotationTable + + ctypedef enum xmlElementType: + XML_ELEMENT_NODE= 1 + XML_ATTRIBUTE_NODE= 2 + XML_TEXT_NODE= 3 + XML_CDATA_SECTION_NODE= 4 + XML_ENTITY_REF_NODE= 5 + XML_ENTITY_NODE= 6 + XML_PI_NODE= 7 + XML_COMMENT_NODE= 8 + XML_DOCUMENT_NODE= 9 + XML_DOCUMENT_TYPE_NODE= 10 + XML_DOCUMENT_FRAG_NODE= 11 + XML_NOTATION_NODE= 12 + XML_HTML_DOCUMENT_NODE= 13 + XML_DTD_NODE= 14 + XML_ELEMENT_DECL= 15 + XML_ATTRIBUTE_DECL= 16 + XML_ENTITY_DECL= 17 + XML_NAMESPACE_DECL= 18 + XML_XINCLUDE_START= 19 + XML_XINCLUDE_END= 20 + + ctypedef enum xmlElementTypeVal: + XML_ELEMENT_TYPE_UNDEFINED= 0 + XML_ELEMENT_TYPE_EMPTY= 1 + XML_ELEMENT_TYPE_ANY= 2 + XML_ELEMENT_TYPE_MIXED= 3 + XML_ELEMENT_TYPE_ELEMENT= 4 + + ctypedef enum xmlElementContentType: + XML_ELEMENT_CONTENT_PCDATA= 1 + XML_ELEMENT_CONTENT_ELEMENT= 2 + XML_ELEMENT_CONTENT_SEQ= 3 + XML_ELEMENT_CONTENT_OR= 4 + + ctypedef enum xmlElementContentOccur: + XML_ELEMENT_CONTENT_ONCE= 1 + XML_ELEMENT_CONTENT_OPT= 2 + XML_ELEMENT_CONTENT_MULT= 3 + XML_ELEMENT_CONTENT_PLUS= 4 + + ctypedef enum xmlAttributeType: + XML_ATTRIBUTE_CDATA = 1 + XML_ATTRIBUTE_ID= 2 + XML_ATTRIBUTE_IDREF= 3 + XML_ATTRIBUTE_IDREFS= 4 + XML_ATTRIBUTE_ENTITY= 5 + XML_ATTRIBUTE_ENTITIES= 6 + XML_ATTRIBUTE_NMTOKEN= 7 + XML_ATTRIBUTE_NMTOKENS= 8 + XML_ATTRIBUTE_ENUMERATION= 9 + XML_ATTRIBUTE_NOTATION= 10 + + ctypedef enum xmlAttributeDefault: + XML_ATTRIBUTE_NONE= 1 + XML_ATTRIBUTE_REQUIRED= 2 + XML_ATTRIBUTE_IMPLIED= 3 + XML_ATTRIBUTE_FIXED= 4 + + ctypedef enum xmlEntityType: + XML_INTERNAL_GENERAL_ENTITY= 1 + XML_EXTERNAL_GENERAL_PARSED_ENTITY= 2 + XML_EXTERNAL_GENERAL_UNPARSED_ENTITY= 3 + XML_INTERNAL_PARAMETER_ENTITY= 4 + XML_EXTERNAL_PARAMETER_ENTITY= 5 + XML_INTERNAL_PREDEFINED_ENTITY= 6 + + ctypedef enum xmlDocProperties: + XML_DOC_WELLFORMED = 1 # /* document is XML well formed */ + XML_DOC_NSVALID = 2 # /* document is Namespace valid */ + XML_DOC_OLD10 = 4 # /* parsed with old XML-1.0 parser */ + XML_DOC_DTDVALID = 8 # /* DTD validation was successful */ + XML_DOC_XINCLUDE = 16 # /* XInclude substitution was done */ + XML_DOC_USERBUILT = 32 # /* Document was built using the API + # and not by parsing an instance */ + XML_DOC_INTERNAL = 64 # /* built for internal processing */ + XML_DOC_HTML = 128 # /* parsed or built HTML document */ + + ctypedef struct xmlNs: + const_xmlChar* href + const_xmlChar* prefix + xmlNs* next + + ctypedef struct xmlNode: + void* _private + xmlElementType type + const_xmlChar* name + xmlNode* children + xmlNode* last + xmlNode* parent + xmlNode* next + xmlNode* prev + xmlDoc* doc + xmlChar* content + xmlAttr* properties + xmlNs* ns + xmlNs* nsDef + unsigned short line + + ctypedef struct xmlElementContent: + xmlElementContentType type + xmlElementContentOccur ocur + const_xmlChar *name + xmlElementContent *c1 + xmlElementContent *c2 + xmlElementContent *parent + const_xmlChar *prefix + + ctypedef struct xmlEnumeration: + xmlEnumeration *next + const_xmlChar *name + + ctypedef struct xmlAttribute: + void* _private + xmlElementType type + const_xmlChar* name + xmlNode* children + xmlNode* last + xmlDtd* parent + xmlNode* next + xmlNode* prev + xmlDoc* doc + xmlAttribute* nexth + xmlAttributeType atype + xmlAttributeDefault def_ "def" + const_xmlChar* defaultValue + xmlEnumeration* tree + const_xmlChar* prefix + const_xmlChar* elem + + ctypedef struct xmlElement: + void* _private + xmlElementType type + const_xmlChar* name + xmlNode* children + xmlNode* last + xmlNode* parent + xmlNode* next + xmlNode* prev + xmlDoc* doc + xmlElementTypeVal etype + xmlElementContent* content + xmlAttribute* attributes + const_xmlChar* prefix + void *contModel + + ctypedef struct xmlEntity: + void* _private + xmlElementType type + const_xmlChar* name + xmlNode* children + xmlNode* last + xmlDtd* parent + xmlNode* next + xmlNode* prev + xmlDoc* doc + xmlChar* orig + xmlChar* content + int length + xmlEntityType etype + const_xmlChar* ExternalID + const_xmlChar* SystemID + xmlEntity* nexte + const_xmlChar* URI + int owner + int checked + + ctypedef struct xmlDtd: + const_xmlChar* name + const_xmlChar* ExternalID + const_xmlChar* SystemID + void* notations + void* entities + void* pentities + void* attributes + void* elements + xmlNode* children + xmlNode* last + xmlDoc* doc + + ctypedef struct xmlDoc: + xmlElementType type + char* name + xmlNode* children + xmlNode* last + xmlNode* parent + xmlNode* next + xmlNode* prev + xmlDoc* doc + xmlDict* dict + xmlHashTable* ids + int standalone + const_xmlChar* version + const_xmlChar* encoding + const_xmlChar* URL + void* _private + xmlDtd* intSubset + xmlDtd* extSubset + int properties + + ctypedef struct xmlAttr: + void* _private + xmlElementType type + const_xmlChar* name + xmlNode* children + xmlNode* last + xmlNode* parent + xmlAttr* next + xmlAttr* prev + xmlDoc* doc + xmlNs* ns + xmlAttributeType atype + + ctypedef struct xmlID: + const_xmlChar* value + const_xmlChar* name + xmlAttr* attr + xmlDoc* doc + + ctypedef struct xmlBuffer + + ctypedef struct xmlBuf # new in libxml2 2.9 + + ctypedef struct xmlOutputBuffer: + xmlBuf* buffer + xmlBuf* conv + int error + + const_xmlChar* XML_XML_NAMESPACE + + cdef void xmlFreeDoc(xmlDoc* cur) + cdef void xmlFreeDtd(xmlDtd* cur) + cdef void xmlFreeNode(xmlNode* cur) + cdef void xmlFreeNsList(xmlNs* ns) + cdef void xmlFreeNs(xmlNs* ns) + cdef void xmlFree(void* buf) + + cdef xmlNode* xmlNewNode(xmlNs* ns, const_xmlChar* name) + cdef xmlNode* xmlNewDocText(xmlDoc* doc, const_xmlChar* content) + cdef xmlNode* xmlNewDocComment(xmlDoc* doc, const_xmlChar* content) + cdef xmlNode* xmlNewDocPI(xmlDoc* doc, const_xmlChar* name, const_xmlChar* content) + cdef xmlNode* xmlNewReference(xmlDoc* doc, const_xmlChar* name) + cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, const_xmlChar* text, int len) + cdef xmlNs* xmlNewNs(xmlNode* node, const_xmlChar* href, const_xmlChar* prefix) + cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) + cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) + cdef xmlNode* xmlAddPrevSibling(xmlNode* cur, xmlNode* elem) + cdef xmlNode* xmlAddNextSibling(xmlNode* cur, xmlNode* elem) + cdef xmlNode* xmlNewDocNode(xmlDoc* doc, xmlNs* ns, + const_xmlChar* name, const_xmlChar* content) + cdef xmlDoc* xmlNewDoc(const_xmlChar* version) + cdef xmlAttr* xmlNewProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) + cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns, + const_xmlChar* name, const_xmlChar* value) + cdef xmlChar* xmlGetNoNsProp(xmlNode* node, const_xmlChar* name) + cdef xmlChar* xmlGetNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) + cdef void xmlSetNs(xmlNode* node, xmlNs* ns) + cdef xmlAttr* xmlSetProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) + cdef xmlAttr* xmlSetNsProp(xmlNode* node, xmlNs* ns, + const_xmlChar* name, const_xmlChar* value) + cdef int xmlRemoveID(xmlDoc* doc, xmlAttr* cur) + cdef int xmlRemoveProp(xmlAttr* cur) + cdef void xmlFreePropList(xmlAttr* cur) + cdef xmlChar* xmlGetNodePath(xmlNode* node) + cdef void xmlDocDumpMemory(xmlDoc* cur, char** mem, int* size) + cdef void xmlDocDumpMemoryEnc(xmlDoc* cur, char** mem, int* size, + char* encoding) + cdef int xmlSaveFileTo(xmlOutputBuffer* out, xmlDoc* cur, + char* encoding) + + cdef void xmlUnlinkNode(xmlNode* cur) + cdef xmlNode* xmlDocSetRootElement(xmlDoc* doc, xmlNode* root) + cdef xmlNode* xmlDocGetRootElement(xmlDoc* doc) + cdef void xmlSetTreeDoc(xmlNode* tree, xmlDoc* doc) + cdef xmlAttr* xmlHasProp(xmlNode* node, const_xmlChar* name) + cdef xmlAttr* xmlHasNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) + cdef xmlChar* xmlNodeGetContent(xmlNode* cur) + cdef int xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) + cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, const_xmlChar* prefix) + cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, const_xmlChar* href) + cdef int xmlIsBlankNode(xmlNode* node) + cdef long xmlGetLineNo(xmlNode* node) + cdef void xmlElemDump(stdio.FILE* f, xmlDoc* doc, xmlNode* cur) + cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf, + xmlDoc* doc, xmlNode* cur, int level, + int format, const_char* encoding) + cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc, + xmlAttr *attr, const_xmlChar *string) + cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) + cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) + cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) + cdef xmlDoc* xmlCopyDoc(xmlDoc* doc, int recursive) + cdef xmlNode* xmlCopyNode(xmlNode* node, int extended) + cdef xmlNode* xmlDocCopyNode(xmlNode* node, xmlDoc* doc, int extended) + cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) + cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns) + cdef xmlBuffer* xmlBufferCreate() + cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string) + cdef void xmlBufferFree(xmlBuffer* buf) + cdef const_xmlChar* xmlBufferContent(xmlBuffer* buf) + cdef int xmlBufferLength(xmlBuffer* buf) + cdef const_xmlChar* xmlBufContent(xmlBuf* buf) # new in libxml2 2.9 + cdef size_t xmlBufUse(xmlBuf* buf) # new in libxml2 2.9 + cdef int xmlKeepBlanksDefault(int val) + cdef xmlChar* xmlNodeGetBase(xmlDoc* doc, xmlNode* node) + cdef xmlDtd* xmlCreateIntSubset(xmlDoc* doc, const_xmlChar* name, + const_xmlChar* ExternalID, const_xmlChar* SystemID) + cdef void xmlNodeSetBase(xmlNode* node, const_xmlChar* uri) + cdef int xmlValidateNCName(const_xmlChar* value, int space) + +cdef extern from "libxml/uri.h" nogil: + cdef const_xmlChar* xmlBuildURI(const_xmlChar* href, const_xmlChar* base) + +cdef extern from "libxml/HTMLtree.h" nogil: + cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf, + xmlDoc* doc, xmlNode* cur, + char* encoding, int format) + cdef xmlDoc* htmlNewDoc(const_xmlChar* uri, const_xmlChar* externalID) + +cdef extern from "libxml/valid.h" nogil: + cdef xmlAttr* xmlGetID(xmlDoc* doc, const_xmlChar* ID) + cdef void xmlDumpNotationTable(xmlBuffer* buffer, + xmlNotationTable* table) + cdef int xmlValidateNameValue(const_xmlChar* value) + +cdef extern from "libxml/xmlIO.h": + cdef int xmlOutputBufferWrite(xmlOutputBuffer* out, + int len, const_char* str) nogil + cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, const_char* str) nogil + cdef int xmlOutputBufferWriteEscape(xmlOutputBuffer* out, + const_xmlChar* str, + xmlCharEncodingOutputFunc escapefunc) nogil + cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) nogil + cdef int xmlOutputBufferClose(xmlOutputBuffer* out) nogil + + ctypedef int (*xmlInputReadCallback)(void* context, + char* buffer, int len) noexcept nogil + ctypedef int (*xmlInputCloseCallback)(void* context) noexcept nogil + + ctypedef int (*xmlOutputWriteCallback)(void* context, + char* buffer, int len) noexcept + ctypedef int (*xmlOutputCloseCallback)(void* context) noexcept + + cdef xmlOutputBuffer* xmlAllocOutputBuffer( + xmlCharEncodingHandler* encoder) nogil + cdef xmlOutputBuffer* xmlOutputBufferCreateIO( + xmlOutputWriteCallback iowrite, + xmlOutputCloseCallback ioclose, + void * ioctx, + xmlCharEncodingHandler* encoder) nogil + cdef xmlOutputBuffer* xmlOutputBufferCreateFile( + stdio.FILE* file, xmlCharEncodingHandler* encoder) nogil + cdef xmlOutputBuffer* xmlOutputBufferCreateFilename( + char* URI, xmlCharEncodingHandler* encoder, int compression) nogil + +cdef extern from "libxml/xmlsave.h" nogil: + ctypedef struct xmlSaveCtxt + + ctypedef enum xmlSaveOption: + XML_SAVE_FORMAT = 1 # format save output (2.6.17) + XML_SAVE_NO_DECL = 2 # drop the xml declaration (2.6.21) + XML_SAVE_NO_EMPTY = 4 # no empty tags (2.6.22) + XML_SAVE_NO_XHTML = 8 # disable XHTML1 specific rules (2.6.22) + XML_SAVE_XHTML = 16 # force XHTML1 specific rules (2.7.2) + XML_SAVE_AS_XML = 32 # force XML serialization on HTML doc (2.7.2) + XML_SAVE_AS_HTML = 64 # force HTML serialization on XML doc (2.7.2) + + cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding, + int options) + cdef xmlSaveCtxt* xmlSaveToBuffer(xmlBuffer* buffer, char* encoding, + int options) # libxml2 2.6.23 + cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) + cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node) + cdef int xmlSaveClose(xmlSaveCtxt* ctxt) + cdef int xmlSaveFlush(xmlSaveCtxt* ctxt) + cdef int xmlSaveSetAttrEscape(xmlSaveCtxt* ctxt, void* escape_func) + cdef int xmlSaveSetEscape(xmlSaveCtxt* ctxt, void* escape_func) + +cdef extern from "libxml/globals.h" nogil: + cdef int xmlThrDefKeepBlanksDefaultValue(int onoff) + cdef int xmlThrDefLineNumbersDefaultValue(int onoff) + cdef int xmlThrDefIndentTreeOutput(int onoff) + +cdef extern from "libxml/xmlmemory.h" nogil: + cdef void* xmlMalloc(size_t size) + cdef int xmlMemBlocks() + cdef int xmlMemUsed() + +cdef extern from "etree_defs.h" nogil: + cdef bint _isElement(xmlNode* node) + cdef bint _isElementOrXInclude(xmlNode* node) + cdef const_xmlChar* _getNs(xmlNode* node) + cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* tree_top, + xmlNode* start_node, + bint inclusive) + cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* start_node) + cdef void BEGIN_FOR_EACH_FROM(xmlNode* tree_top, + xmlNode* start_node, + bint inclusive) + cdef void END_FOR_EACH_FROM(xmlNode* start_node) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd new file mode 100644 index 0000000000000000000000000000000000000000..f886a54b9a9d7fc3094f99210f54edd98f12fab7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd @@ -0,0 +1,5 @@ +cdef extern from "libxml/uri.h" nogil: + ctypedef struct xmlURI + + cdef xmlURI* xmlParseURI(char* str) + cdef void xmlFreeURI(xmlURI* uri) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd new file mode 100644 index 0000000000000000000000000000000000000000..68267175afa602f6bb0970566bd5f71f2d318af7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd @@ -0,0 +1,22 @@ +from lxml.includes.tree cimport xmlDoc, xmlNode + +cdef extern from "libxml/xinclude.h" nogil: + + ctypedef struct xmlXIncludeCtxt + + cdef int xmlXIncludeProcess(xmlDoc* doc) + cdef int xmlXIncludeProcessFlags(xmlDoc* doc, int parser_opts) + cdef int xmlXIncludeProcessTree(xmlNode* doc) + cdef int xmlXIncludeProcessTreeFlags(xmlNode* doc, int parser_opts) + + # libxml2 >= 2.7.4 + cdef int xmlXIncludeProcessTreeFlagsData( + xmlNode* doc, int parser_opts, void* data) + + cdef xmlXIncludeCtxt* xmlXIncludeNewContext(xmlDoc* doc) + cdef int xmlXIncludeProcessNode(xmlXIncludeCtxt* ctxt, xmlNode* node) + cdef int xmlXIncludeSetFlags(xmlXIncludeCtxt* ctxt, int flags) + + # libxml2 >= 2.6.27 + cdef int xmlXIncludeProcessFlagsData( + xmlDoc* doc, int flags, void* data) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0249a45e2c4b757f236ab5891510934f9add6b3e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd @@ -0,0 +1,860 @@ + +# --- BEGIN: GENERATED CONSTANTS --- + +# This section is generated by the script 'update-error-constants.py'. + +cdef extern from "libxml/xmlerror.h": + ctypedef enum xmlErrorLevel: + XML_ERR_NONE = 0 # Success + XML_ERR_WARNING = 1 # A warning + XML_ERR_ERROR = 2 # An error + XML_ERR_FATAL = 3 # A fatal error + + ctypedef enum xmlErrorDomain: + XML_FROM_NONE = 0 # Unknown + XML_FROM_PARSER = 1 # The XML parser + XML_FROM_TREE = 2 # The tree module (unused) + XML_FROM_NAMESPACE = 3 # The XML Namespace module + XML_FROM_DTD = 4 # The XML DTD validation with parser context + XML_FROM_HTML = 5 # The HTML parser + XML_FROM_MEMORY = 6 # The memory allocator (unused) + XML_FROM_OUTPUT = 7 # The serialization code + XML_FROM_IO = 8 # The Input/Output stack + XML_FROM_FTP = 9 # The FTP module (unused) + XML_FROM_HTTP = 10 # The HTTP module (unused) + XML_FROM_XINCLUDE = 11 # The XInclude processing + XML_FROM_XPATH = 12 # The XPath module + XML_FROM_XPOINTER = 13 # The XPointer module + XML_FROM_REGEXP = 14 # The regular expressions module + XML_FROM_DATATYPE = 15 # The W3C XML Schemas Datatype module + XML_FROM_SCHEMASP = 16 # The W3C XML Schemas parser module + XML_FROM_SCHEMASV = 17 # The W3C XML Schemas validation module + XML_FROM_RELAXNGP = 18 # The Relax-NG parser module + XML_FROM_RELAXNGV = 19 # The Relax-NG validator module + XML_FROM_CATALOG = 20 # The Catalog module + XML_FROM_C14N = 21 # The Canonicalization module + XML_FROM_XSLT = 22 # The XSLT engine from libxslt (unused) + XML_FROM_VALID = 23 # The XML DTD validation with valid context + XML_FROM_CHECK = 24 # The error checking module (unused) + XML_FROM_WRITER = 25 # The xmlwriter module + XML_FROM_MODULE = 26 # The dynamically loaded module module (unused) + XML_FROM_I18N = 27 # The module handling character conversion (unused) + XML_FROM_SCHEMATRONV = 28 # The Schematron validator module + XML_FROM_BUFFER = 29 # The buffers module (unused) + XML_FROM_URI = 30 # The URI module (unused) + + ctypedef enum xmlParserErrors: + XML_ERR_OK = 0 # Success + XML_ERR_INTERNAL_ERROR = 1 # Internal assertion failure + XML_ERR_NO_MEMORY = 2 # Out of memory + XML_ERR_DOCUMENT_START = 3 + XML_ERR_DOCUMENT_EMPTY = 4 + XML_ERR_DOCUMENT_END = 5 + XML_ERR_INVALID_HEX_CHARREF = 6 + XML_ERR_INVALID_DEC_CHARREF = 7 + XML_ERR_INVALID_CHARREF = 8 + XML_ERR_INVALID_CHAR = 9 + XML_ERR_CHARREF_AT_EOF = 10 + XML_ERR_CHARREF_IN_PROLOG = 11 + XML_ERR_CHARREF_IN_EPILOG = 12 + XML_ERR_CHARREF_IN_DTD = 13 + XML_ERR_ENTITYREF_AT_EOF = 14 + XML_ERR_ENTITYREF_IN_PROLOG = 15 + XML_ERR_ENTITYREF_IN_EPILOG = 16 + XML_ERR_ENTITYREF_IN_DTD = 17 + XML_ERR_PEREF_AT_EOF = 18 + XML_ERR_PEREF_IN_PROLOG = 19 + XML_ERR_PEREF_IN_EPILOG = 20 + XML_ERR_PEREF_IN_INT_SUBSET = 21 + XML_ERR_ENTITYREF_NO_NAME = 22 + XML_ERR_ENTITYREF_SEMICOL_MISSING = 23 + XML_ERR_PEREF_NO_NAME = 24 + XML_ERR_PEREF_SEMICOL_MISSING = 25 + XML_ERR_UNDECLARED_ENTITY = 26 + XML_WAR_UNDECLARED_ENTITY = 27 + XML_ERR_UNPARSED_ENTITY = 28 + XML_ERR_ENTITY_IS_EXTERNAL = 29 + XML_ERR_ENTITY_IS_PARAMETER = 30 + XML_ERR_UNKNOWN_ENCODING = 31 + XML_ERR_UNSUPPORTED_ENCODING = 32 # Unsupported character encoding + XML_ERR_STRING_NOT_STARTED = 33 + XML_ERR_STRING_NOT_CLOSED = 34 + XML_ERR_NS_DECL_ERROR = 35 + XML_ERR_ENTITY_NOT_STARTED = 36 + XML_ERR_ENTITY_NOT_FINISHED = 37 + XML_ERR_LT_IN_ATTRIBUTE = 38 + XML_ERR_ATTRIBUTE_NOT_STARTED = 39 + XML_ERR_ATTRIBUTE_NOT_FINISHED = 40 + XML_ERR_ATTRIBUTE_WITHOUT_VALUE = 41 + XML_ERR_ATTRIBUTE_REDEFINED = 42 + XML_ERR_LITERAL_NOT_STARTED = 43 + XML_ERR_LITERAL_NOT_FINISHED = 44 + XML_ERR_COMMENT_NOT_FINISHED = 45 + XML_ERR_PI_NOT_STARTED = 46 + XML_ERR_PI_NOT_FINISHED = 47 + XML_ERR_NOTATION_NOT_STARTED = 48 + XML_ERR_NOTATION_NOT_FINISHED = 49 + XML_ERR_ATTLIST_NOT_STARTED = 50 + XML_ERR_ATTLIST_NOT_FINISHED = 51 + XML_ERR_MIXED_NOT_STARTED = 52 + XML_ERR_MIXED_NOT_FINISHED = 53 + XML_ERR_ELEMCONTENT_NOT_STARTED = 54 + XML_ERR_ELEMCONTENT_NOT_FINISHED = 55 + XML_ERR_XMLDECL_NOT_STARTED = 56 + XML_ERR_XMLDECL_NOT_FINISHED = 57 + XML_ERR_CONDSEC_NOT_STARTED = 58 + XML_ERR_CONDSEC_NOT_FINISHED = 59 + XML_ERR_EXT_SUBSET_NOT_FINISHED = 60 + XML_ERR_DOCTYPE_NOT_FINISHED = 61 + XML_ERR_MISPLACED_CDATA_END = 62 + XML_ERR_CDATA_NOT_FINISHED = 63 + XML_ERR_RESERVED_XML_NAME = 64 + XML_ERR_SPACE_REQUIRED = 65 + XML_ERR_SEPARATOR_REQUIRED = 66 + XML_ERR_NMTOKEN_REQUIRED = 67 + XML_ERR_NAME_REQUIRED = 68 + XML_ERR_PCDATA_REQUIRED = 69 + XML_ERR_URI_REQUIRED = 70 + XML_ERR_PUBID_REQUIRED = 71 + XML_ERR_LT_REQUIRED = 72 + XML_ERR_GT_REQUIRED = 73 + XML_ERR_LTSLASH_REQUIRED = 74 + XML_ERR_EQUAL_REQUIRED = 75 + XML_ERR_TAG_NAME_MISMATCH = 76 + XML_ERR_TAG_NOT_FINISHED = 77 + XML_ERR_STANDALONE_VALUE = 78 + XML_ERR_ENCODING_NAME = 79 + XML_ERR_HYPHEN_IN_COMMENT = 80 + XML_ERR_INVALID_ENCODING = 81 + XML_ERR_EXT_ENTITY_STANDALONE = 82 + XML_ERR_CONDSEC_INVALID = 83 + XML_ERR_VALUE_REQUIRED = 84 + XML_ERR_NOT_WELL_BALANCED = 85 + XML_ERR_EXTRA_CONTENT = 86 + XML_ERR_ENTITY_CHAR_ERROR = 87 + XML_ERR_ENTITY_PE_INTERNAL = 88 + XML_ERR_ENTITY_LOOP = 89 + XML_ERR_ENTITY_BOUNDARY = 90 + XML_ERR_INVALID_URI = 91 + XML_ERR_URI_FRAGMENT = 92 + XML_WAR_CATALOG_PI = 93 + XML_ERR_NO_DTD = 94 + XML_ERR_CONDSEC_INVALID_KEYWORD = 95 + XML_ERR_VERSION_MISSING = 96 + XML_WAR_UNKNOWN_VERSION = 97 + XML_WAR_LANG_VALUE = 98 + XML_WAR_NS_URI = 99 + XML_WAR_NS_URI_RELATIVE = 100 + XML_ERR_MISSING_ENCODING = 101 + XML_WAR_SPACE_VALUE = 102 + XML_ERR_NOT_STANDALONE = 103 + XML_ERR_ENTITY_PROCESSING = 104 + XML_ERR_NOTATION_PROCESSING = 105 + XML_WAR_NS_COLUMN = 106 + XML_WAR_ENTITY_REDEFINED = 107 + XML_ERR_UNKNOWN_VERSION = 108 + XML_ERR_VERSION_MISMATCH = 109 + XML_ERR_NAME_TOO_LONG = 110 + XML_ERR_USER_STOP = 111 + XML_ERR_COMMENT_ABRUPTLY_ENDED = 112 + XML_WAR_ENCODING_MISMATCH = 113 + XML_ERR_RESOURCE_LIMIT = 114 # Internal resource limit like maximum amplification factor exceeded + XML_ERR_ARGUMENT = 115 # Invalid argument + XML_ERR_SYSTEM = 116 # Unexpected error from the OS or an external library + XML_ERR_REDECL_PREDEF_ENTITY = 117 + XML_ERR_INT_SUBSET_NOT_FINISHED = 118 + XML_NS_ERR_XML_NAMESPACE = 200 + XML_NS_ERR_UNDEFINED_NAMESPACE = 201 + XML_NS_ERR_QNAME = 202 + XML_NS_ERR_ATTRIBUTE_REDEFINED = 203 + XML_NS_ERR_EMPTY = 204 + XML_NS_ERR_COLON = 205 + XML_DTD_ATTRIBUTE_DEFAULT = 500 + XML_DTD_ATTRIBUTE_REDEFINED = 501 + XML_DTD_ATTRIBUTE_VALUE = 502 + XML_DTD_CONTENT_ERROR = 503 + XML_DTD_CONTENT_MODEL = 504 + XML_DTD_CONTENT_NOT_DETERMINIST = 505 + XML_DTD_DIFFERENT_PREFIX = 506 + XML_DTD_ELEM_DEFAULT_NAMESPACE = 507 + XML_DTD_ELEM_NAMESPACE = 508 + XML_DTD_ELEM_REDEFINED = 509 + XML_DTD_EMPTY_NOTATION = 510 + XML_DTD_ENTITY_TYPE = 511 + XML_DTD_ID_FIXED = 512 + XML_DTD_ID_REDEFINED = 513 + XML_DTD_ID_SUBSET = 514 + XML_DTD_INVALID_CHILD = 515 + XML_DTD_INVALID_DEFAULT = 516 + XML_DTD_LOAD_ERROR = 517 + XML_DTD_MISSING_ATTRIBUTE = 518 + XML_DTD_MIXED_CORRUPT = 519 + XML_DTD_MULTIPLE_ID = 520 + XML_DTD_NO_DOC = 521 + XML_DTD_NO_DTD = 522 + XML_DTD_NO_ELEM_NAME = 523 + XML_DTD_NO_PREFIX = 524 + XML_DTD_NO_ROOT = 525 + XML_DTD_NOTATION_REDEFINED = 526 + XML_DTD_NOTATION_VALUE = 527 + XML_DTD_NOT_EMPTY = 528 + XML_DTD_NOT_PCDATA = 529 + XML_DTD_NOT_STANDALONE = 530 + XML_DTD_ROOT_NAME = 531 + XML_DTD_STANDALONE_WHITE_SPACE = 532 + XML_DTD_UNKNOWN_ATTRIBUTE = 533 + XML_DTD_UNKNOWN_ELEM = 534 + XML_DTD_UNKNOWN_ENTITY = 535 + XML_DTD_UNKNOWN_ID = 536 + XML_DTD_UNKNOWN_NOTATION = 537 + XML_DTD_STANDALONE_DEFAULTED = 538 + XML_DTD_XMLID_VALUE = 539 + XML_DTD_XMLID_TYPE = 540 + XML_DTD_DUP_TOKEN = 541 + XML_HTML_STRUCURE_ERROR = 800 + XML_HTML_UNKNOWN_TAG = 801 + XML_HTML_INCORRECTLY_OPENED_COMMENT = 802 + XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000 + XML_RNGP_ATTR_CONFLICT = 1001 + XML_RNGP_ATTRIBUTE_CHILDREN = 1002 + XML_RNGP_ATTRIBUTE_CONTENT = 1003 + XML_RNGP_ATTRIBUTE_EMPTY = 1004 + XML_RNGP_ATTRIBUTE_NOOP = 1005 + XML_RNGP_CHOICE_CONTENT = 1006 + XML_RNGP_CHOICE_EMPTY = 1007 + XML_RNGP_CREATE_FAILURE = 1008 + XML_RNGP_DATA_CONTENT = 1009 + XML_RNGP_DEF_CHOICE_AND_INTERLEAVE = 1010 + XML_RNGP_DEFINE_CREATE_FAILED = 1011 + XML_RNGP_DEFINE_EMPTY = 1012 + XML_RNGP_DEFINE_MISSING = 1013 + XML_RNGP_DEFINE_NAME_MISSING = 1014 + XML_RNGP_ELEM_CONTENT_EMPTY = 1015 + XML_RNGP_ELEM_CONTENT_ERROR = 1016 + XML_RNGP_ELEMENT_EMPTY = 1017 + XML_RNGP_ELEMENT_CONTENT = 1018 + XML_RNGP_ELEMENT_NAME = 1019 + XML_RNGP_ELEMENT_NO_CONTENT = 1020 + XML_RNGP_ELEM_TEXT_CONFLICT = 1021 + XML_RNGP_EMPTY = 1022 + XML_RNGP_EMPTY_CONSTRUCT = 1023 + XML_RNGP_EMPTY_CONTENT = 1024 + XML_RNGP_EMPTY_NOT_EMPTY = 1025 + XML_RNGP_ERROR_TYPE_LIB = 1026 + XML_RNGP_EXCEPT_EMPTY = 1027 + XML_RNGP_EXCEPT_MISSING = 1028 + XML_RNGP_EXCEPT_MULTIPLE = 1029 + XML_RNGP_EXCEPT_NO_CONTENT = 1030 + XML_RNGP_EXTERNALREF_EMTPY = 1031 + XML_RNGP_EXTERNAL_REF_FAILURE = 1032 + XML_RNGP_EXTERNALREF_RECURSE = 1033 + XML_RNGP_FORBIDDEN_ATTRIBUTE = 1034 + XML_RNGP_FOREIGN_ELEMENT = 1035 + XML_RNGP_GRAMMAR_CONTENT = 1036 + XML_RNGP_GRAMMAR_EMPTY = 1037 + XML_RNGP_GRAMMAR_MISSING = 1038 + XML_RNGP_GRAMMAR_NO_START = 1039 + XML_RNGP_GROUP_ATTR_CONFLICT = 1040 + XML_RNGP_HREF_ERROR = 1041 + XML_RNGP_INCLUDE_EMPTY = 1042 + XML_RNGP_INCLUDE_FAILURE = 1043 + XML_RNGP_INCLUDE_RECURSE = 1044 + XML_RNGP_INTERLEAVE_ADD = 1045 + XML_RNGP_INTERLEAVE_CREATE_FAILED = 1046 + XML_RNGP_INTERLEAVE_EMPTY = 1047 + XML_RNGP_INTERLEAVE_NO_CONTENT = 1048 + XML_RNGP_INVALID_DEFINE_NAME = 1049 + XML_RNGP_INVALID_URI = 1050 + XML_RNGP_INVALID_VALUE = 1051 + XML_RNGP_MISSING_HREF = 1052 + XML_RNGP_NAME_MISSING = 1053 + XML_RNGP_NEED_COMBINE = 1054 + XML_RNGP_NOTALLOWED_NOT_EMPTY = 1055 + XML_RNGP_NSNAME_ATTR_ANCESTOR = 1056 + XML_RNGP_NSNAME_NO_NS = 1057 + XML_RNGP_PARAM_FORBIDDEN = 1058 + XML_RNGP_PARAM_NAME_MISSING = 1059 + XML_RNGP_PARENTREF_CREATE_FAILED = 1060 + XML_RNGP_PARENTREF_NAME_INVALID = 1061 + XML_RNGP_PARENTREF_NO_NAME = 1062 + XML_RNGP_PARENTREF_NO_PARENT = 1063 + XML_RNGP_PARENTREF_NOT_EMPTY = 1064 + XML_RNGP_PARSE_ERROR = 1065 + XML_RNGP_PAT_ANYNAME_EXCEPT_ANYNAME = 1066 + XML_RNGP_PAT_ATTR_ATTR = 1067 + XML_RNGP_PAT_ATTR_ELEM = 1068 + XML_RNGP_PAT_DATA_EXCEPT_ATTR = 1069 + XML_RNGP_PAT_DATA_EXCEPT_ELEM = 1070 + XML_RNGP_PAT_DATA_EXCEPT_EMPTY = 1071 + XML_RNGP_PAT_DATA_EXCEPT_GROUP = 1072 + XML_RNGP_PAT_DATA_EXCEPT_INTERLEAVE = 1073 + XML_RNGP_PAT_DATA_EXCEPT_LIST = 1074 + XML_RNGP_PAT_DATA_EXCEPT_ONEMORE = 1075 + XML_RNGP_PAT_DATA_EXCEPT_REF = 1076 + XML_RNGP_PAT_DATA_EXCEPT_TEXT = 1077 + XML_RNGP_PAT_LIST_ATTR = 1078 + XML_RNGP_PAT_LIST_ELEM = 1079 + XML_RNGP_PAT_LIST_INTERLEAVE = 1080 + XML_RNGP_PAT_LIST_LIST = 1081 + XML_RNGP_PAT_LIST_REF = 1082 + XML_RNGP_PAT_LIST_TEXT = 1083 + XML_RNGP_PAT_NSNAME_EXCEPT_ANYNAME = 1084 + XML_RNGP_PAT_NSNAME_EXCEPT_NSNAME = 1085 + XML_RNGP_PAT_ONEMORE_GROUP_ATTR = 1086 + XML_RNGP_PAT_ONEMORE_INTERLEAVE_ATTR = 1087 + XML_RNGP_PAT_START_ATTR = 1088 + XML_RNGP_PAT_START_DATA = 1089 + XML_RNGP_PAT_START_EMPTY = 1090 + XML_RNGP_PAT_START_GROUP = 1091 + XML_RNGP_PAT_START_INTERLEAVE = 1092 + XML_RNGP_PAT_START_LIST = 1093 + XML_RNGP_PAT_START_ONEMORE = 1094 + XML_RNGP_PAT_START_TEXT = 1095 + XML_RNGP_PAT_START_VALUE = 1096 + XML_RNGP_PREFIX_UNDEFINED = 1097 + XML_RNGP_REF_CREATE_FAILED = 1098 + XML_RNGP_REF_CYCLE = 1099 + XML_RNGP_REF_NAME_INVALID = 1100 + XML_RNGP_REF_NO_DEF = 1101 + XML_RNGP_REF_NO_NAME = 1102 + XML_RNGP_REF_NOT_EMPTY = 1103 + XML_RNGP_START_CHOICE_AND_INTERLEAVE = 1104 + XML_RNGP_START_CONTENT = 1105 + XML_RNGP_START_EMPTY = 1106 + XML_RNGP_START_MISSING = 1107 + XML_RNGP_TEXT_EXPECTED = 1108 + XML_RNGP_TEXT_HAS_CHILD = 1109 + XML_RNGP_TYPE_MISSING = 1110 + XML_RNGP_TYPE_NOT_FOUND = 1111 + XML_RNGP_TYPE_VALUE = 1112 + XML_RNGP_UNKNOWN_ATTRIBUTE = 1113 + XML_RNGP_UNKNOWN_COMBINE = 1114 + XML_RNGP_UNKNOWN_CONSTRUCT = 1115 + XML_RNGP_UNKNOWN_TYPE_LIB = 1116 + XML_RNGP_URI_FRAGMENT = 1117 + XML_RNGP_URI_NOT_ABSOLUTE = 1118 + XML_RNGP_VALUE_EMPTY = 1119 + XML_RNGP_VALUE_NO_CONTENT = 1120 + XML_RNGP_XMLNS_NAME = 1121 + XML_RNGP_XML_NS = 1122 + XML_XPATH_EXPRESSION_OK = 1200 + XML_XPATH_NUMBER_ERROR = 1201 + XML_XPATH_UNFINISHED_LITERAL_ERROR = 1202 + XML_XPATH_START_LITERAL_ERROR = 1203 + XML_XPATH_VARIABLE_REF_ERROR = 1204 + XML_XPATH_UNDEF_VARIABLE_ERROR = 1205 + XML_XPATH_INVALID_PREDICATE_ERROR = 1206 + XML_XPATH_EXPR_ERROR = 1207 + XML_XPATH_UNCLOSED_ERROR = 1208 + XML_XPATH_UNKNOWN_FUNC_ERROR = 1209 + XML_XPATH_INVALID_OPERAND = 1210 + XML_XPATH_INVALID_TYPE = 1211 + XML_XPATH_INVALID_ARITY = 1212 + XML_XPATH_INVALID_CTXT_SIZE = 1213 + XML_XPATH_INVALID_CTXT_POSITION = 1214 + XML_XPATH_MEMORY_ERROR = 1215 + XML_XPTR_SYNTAX_ERROR = 1216 + XML_XPTR_RESOURCE_ERROR = 1217 + XML_XPTR_SUB_RESOURCE_ERROR = 1218 + XML_XPATH_UNDEF_PREFIX_ERROR = 1219 + XML_XPATH_ENCODING_ERROR = 1220 + XML_XPATH_INVALID_CHAR_ERROR = 1221 + XML_TREE_INVALID_HEX = 1300 + XML_TREE_INVALID_DEC = 1301 + XML_TREE_UNTERMINATED_ENTITY = 1302 + XML_TREE_NOT_UTF8 = 1303 + XML_SAVE_NOT_UTF8 = 1400 + XML_SAVE_CHAR_INVALID = 1401 + XML_SAVE_NO_DOCTYPE = 1402 + XML_SAVE_UNKNOWN_ENCODING = 1403 + XML_REGEXP_COMPILE_ERROR = 1450 + XML_IO_UNKNOWN = 1500 + XML_IO_EACCES = 1501 + XML_IO_EAGAIN = 1502 + XML_IO_EBADF = 1503 + XML_IO_EBADMSG = 1504 + XML_IO_EBUSY = 1505 + XML_IO_ECANCELED = 1506 + XML_IO_ECHILD = 1507 + XML_IO_EDEADLK = 1508 + XML_IO_EDOM = 1509 + XML_IO_EEXIST = 1510 + XML_IO_EFAULT = 1511 + XML_IO_EFBIG = 1512 + XML_IO_EINPROGRESS = 1513 + XML_IO_EINTR = 1514 + XML_IO_EINVAL = 1515 + XML_IO_EIO = 1516 + XML_IO_EISDIR = 1517 + XML_IO_EMFILE = 1518 + XML_IO_EMLINK = 1519 + XML_IO_EMSGSIZE = 1520 + XML_IO_ENAMETOOLONG = 1521 + XML_IO_ENFILE = 1522 + XML_IO_ENODEV = 1523 + XML_IO_ENOENT = 1524 # File not found + XML_IO_ENOEXEC = 1525 + XML_IO_ENOLCK = 1526 + XML_IO_ENOMEM = 1527 + XML_IO_ENOSPC = 1528 + XML_IO_ENOSYS = 1529 + XML_IO_ENOTDIR = 1530 + XML_IO_ENOTEMPTY = 1531 + XML_IO_ENOTSUP = 1532 + XML_IO_ENOTTY = 1533 + XML_IO_ENXIO = 1534 + XML_IO_EPERM = 1535 + XML_IO_EPIPE = 1536 + XML_IO_ERANGE = 1537 + XML_IO_EROFS = 1538 + XML_IO_ESPIPE = 1539 + XML_IO_ESRCH = 1540 + XML_IO_ETIMEDOUT = 1541 + XML_IO_EXDEV = 1542 + XML_IO_NETWORK_ATTEMPT = 1543 + XML_IO_ENCODER = 1544 + XML_IO_FLUSH = 1545 + XML_IO_WRITE = 1546 + XML_IO_NO_INPUT = 1547 + XML_IO_BUFFER_FULL = 1548 + XML_IO_LOAD_ERROR = 1549 + XML_IO_ENOTSOCK = 1550 + XML_IO_EISCONN = 1551 + XML_IO_ECONNREFUSED = 1552 + XML_IO_ENETUNREACH = 1553 + XML_IO_EADDRINUSE = 1554 + XML_IO_EALREADY = 1555 + XML_IO_EAFNOSUPPORT = 1556 + XML_IO_UNSUPPORTED_PROTOCOL = 1557 + XML_XINCLUDE_RECURSION = 1600 + XML_XINCLUDE_PARSE_VALUE = 1601 + XML_XINCLUDE_ENTITY_DEF_MISMATCH = 1602 + XML_XINCLUDE_NO_HREF = 1603 + XML_XINCLUDE_NO_FALLBACK = 1604 + XML_XINCLUDE_HREF_URI = 1605 + XML_XINCLUDE_TEXT_FRAGMENT = 1606 + XML_XINCLUDE_TEXT_DOCUMENT = 1607 + XML_XINCLUDE_INVALID_CHAR = 1608 + XML_XINCLUDE_BUILD_FAILED = 1609 + XML_XINCLUDE_UNKNOWN_ENCODING = 1610 + XML_XINCLUDE_MULTIPLE_ROOT = 1611 + XML_XINCLUDE_XPTR_FAILED = 1612 + XML_XINCLUDE_XPTR_RESULT = 1613 + XML_XINCLUDE_INCLUDE_IN_INCLUDE = 1614 + XML_XINCLUDE_FALLBACKS_IN_INCLUDE = 1615 + XML_XINCLUDE_FALLBACK_NOT_IN_INCLUDE = 1616 + XML_XINCLUDE_DEPRECATED_NS = 1617 + XML_XINCLUDE_FRAGMENT_ID = 1618 + XML_CATALOG_MISSING_ATTR = 1650 + XML_CATALOG_ENTRY_BROKEN = 1651 + XML_CATALOG_PREFER_VALUE = 1652 + XML_CATALOG_NOT_CATALOG = 1653 + XML_CATALOG_RECURSION = 1654 + XML_SCHEMAP_PREFIX_UNDEFINED = 1700 + XML_SCHEMAP_ATTRFORMDEFAULT_VALUE = 1701 + XML_SCHEMAP_ATTRGRP_NONAME_NOREF = 1702 + XML_SCHEMAP_ATTR_NONAME_NOREF = 1703 + XML_SCHEMAP_COMPLEXTYPE_NONAME_NOREF = 1704 + XML_SCHEMAP_ELEMFORMDEFAULT_VALUE = 1705 + XML_SCHEMAP_ELEM_NONAME_NOREF = 1706 + XML_SCHEMAP_EXTENSION_NO_BASE = 1707 + XML_SCHEMAP_FACET_NO_VALUE = 1708 + XML_SCHEMAP_FAILED_BUILD_IMPORT = 1709 + XML_SCHEMAP_GROUP_NONAME_NOREF = 1710 + XML_SCHEMAP_IMPORT_NAMESPACE_NOT_URI = 1711 + XML_SCHEMAP_IMPORT_REDEFINE_NSNAME = 1712 + XML_SCHEMAP_IMPORT_SCHEMA_NOT_URI = 1713 + XML_SCHEMAP_INVALID_BOOLEAN = 1714 + XML_SCHEMAP_INVALID_ENUM = 1715 + XML_SCHEMAP_INVALID_FACET = 1716 + XML_SCHEMAP_INVALID_FACET_VALUE = 1717 + XML_SCHEMAP_INVALID_MAXOCCURS = 1718 + XML_SCHEMAP_INVALID_MINOCCURS = 1719 + XML_SCHEMAP_INVALID_REF_AND_SUBTYPE = 1720 + XML_SCHEMAP_INVALID_WHITE_SPACE = 1721 + XML_SCHEMAP_NOATTR_NOREF = 1722 + XML_SCHEMAP_NOTATION_NO_NAME = 1723 + XML_SCHEMAP_NOTYPE_NOREF = 1724 + XML_SCHEMAP_REF_AND_SUBTYPE = 1725 + XML_SCHEMAP_RESTRICTION_NONAME_NOREF = 1726 + XML_SCHEMAP_SIMPLETYPE_NONAME = 1727 + XML_SCHEMAP_TYPE_AND_SUBTYPE = 1728 + XML_SCHEMAP_UNKNOWN_ALL_CHILD = 1729 + XML_SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD = 1730 + XML_SCHEMAP_UNKNOWN_ATTR_CHILD = 1731 + XML_SCHEMAP_UNKNOWN_ATTRGRP_CHILD = 1732 + XML_SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP = 1733 + XML_SCHEMAP_UNKNOWN_BASE_TYPE = 1734 + XML_SCHEMAP_UNKNOWN_CHOICE_CHILD = 1735 + XML_SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD = 1736 + XML_SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD = 1737 + XML_SCHEMAP_UNKNOWN_ELEM_CHILD = 1738 + XML_SCHEMAP_UNKNOWN_EXTENSION_CHILD = 1739 + XML_SCHEMAP_UNKNOWN_FACET_CHILD = 1740 + XML_SCHEMAP_UNKNOWN_FACET_TYPE = 1741 + XML_SCHEMAP_UNKNOWN_GROUP_CHILD = 1742 + XML_SCHEMAP_UNKNOWN_IMPORT_CHILD = 1743 + XML_SCHEMAP_UNKNOWN_LIST_CHILD = 1744 + XML_SCHEMAP_UNKNOWN_NOTATION_CHILD = 1745 + XML_SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD = 1746 + XML_SCHEMAP_UNKNOWN_REF = 1747 + XML_SCHEMAP_UNKNOWN_RESTRICTION_CHILD = 1748 + XML_SCHEMAP_UNKNOWN_SCHEMAS_CHILD = 1749 + XML_SCHEMAP_UNKNOWN_SEQUENCE_CHILD = 1750 + XML_SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD = 1751 + XML_SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD = 1752 + XML_SCHEMAP_UNKNOWN_TYPE = 1753 + XML_SCHEMAP_UNKNOWN_UNION_CHILD = 1754 + XML_SCHEMAP_ELEM_DEFAULT_FIXED = 1755 + XML_SCHEMAP_REGEXP_INVALID = 1756 + XML_SCHEMAP_FAILED_LOAD = 1757 + XML_SCHEMAP_NOTHING_TO_PARSE = 1758 + XML_SCHEMAP_NOROOT = 1759 + XML_SCHEMAP_REDEFINED_GROUP = 1760 + XML_SCHEMAP_REDEFINED_TYPE = 1761 + XML_SCHEMAP_REDEFINED_ELEMENT = 1762 + XML_SCHEMAP_REDEFINED_ATTRGROUP = 1763 + XML_SCHEMAP_REDEFINED_ATTR = 1764 + XML_SCHEMAP_REDEFINED_NOTATION = 1765 + XML_SCHEMAP_FAILED_PARSE = 1766 + XML_SCHEMAP_UNKNOWN_PREFIX = 1767 + XML_SCHEMAP_DEF_AND_PREFIX = 1768 + XML_SCHEMAP_UNKNOWN_INCLUDE_CHILD = 1769 + XML_SCHEMAP_INCLUDE_SCHEMA_NOT_URI = 1770 + XML_SCHEMAP_INCLUDE_SCHEMA_NO_URI = 1771 + XML_SCHEMAP_NOT_SCHEMA = 1772 + XML_SCHEMAP_UNKNOWN_MEMBER_TYPE = 1773 + XML_SCHEMAP_INVALID_ATTR_USE = 1774 + XML_SCHEMAP_RECURSIVE = 1775 + XML_SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE = 1776 + XML_SCHEMAP_INVALID_ATTR_COMBINATION = 1777 + XML_SCHEMAP_INVALID_ATTR_INLINE_COMBINATION = 1778 + XML_SCHEMAP_MISSING_SIMPLETYPE_CHILD = 1779 + XML_SCHEMAP_INVALID_ATTR_NAME = 1780 + XML_SCHEMAP_REF_AND_CONTENT = 1781 + XML_SCHEMAP_CT_PROPS_CORRECT_1 = 1782 + XML_SCHEMAP_CT_PROPS_CORRECT_2 = 1783 + XML_SCHEMAP_CT_PROPS_CORRECT_3 = 1784 + XML_SCHEMAP_CT_PROPS_CORRECT_4 = 1785 + XML_SCHEMAP_CT_PROPS_CORRECT_5 = 1786 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_1 = 1787 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1 = 1788 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2 = 1789 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_2 = 1790 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_3 = 1791 + XML_SCHEMAP_WILDCARD_INVALID_NS_MEMBER = 1792 + XML_SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE = 1793 + XML_SCHEMAP_UNION_NOT_EXPRESSIBLE = 1794 + XML_SCHEMAP_SRC_IMPORT_3_1 = 1795 + XML_SCHEMAP_SRC_IMPORT_3_2 = 1796 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_1 = 1797 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_2 = 1798 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_3 = 1799 + XML_SCHEMAP_COS_CT_EXTENDS_1_3 = 1800 + XML_SCHEMAV_NOROOT = 1801 + XML_SCHEMAV_UNDECLAREDELEM = 1802 + XML_SCHEMAV_NOTTOPLEVEL = 1803 + XML_SCHEMAV_MISSING = 1804 + XML_SCHEMAV_WRONGELEM = 1805 + XML_SCHEMAV_NOTYPE = 1806 + XML_SCHEMAV_NOROLLBACK = 1807 + XML_SCHEMAV_ISABSTRACT = 1808 + XML_SCHEMAV_NOTEMPTY = 1809 + XML_SCHEMAV_ELEMCONT = 1810 + XML_SCHEMAV_HAVEDEFAULT = 1811 + XML_SCHEMAV_NOTNILLABLE = 1812 + XML_SCHEMAV_EXTRACONTENT = 1813 + XML_SCHEMAV_INVALIDATTR = 1814 + XML_SCHEMAV_INVALIDELEM = 1815 + XML_SCHEMAV_NOTDETERMINIST = 1816 + XML_SCHEMAV_CONSTRUCT = 1817 + XML_SCHEMAV_INTERNAL = 1818 + XML_SCHEMAV_NOTSIMPLE = 1819 + XML_SCHEMAV_ATTRUNKNOWN = 1820 + XML_SCHEMAV_ATTRINVALID = 1821 + XML_SCHEMAV_VALUE = 1822 + XML_SCHEMAV_FACET = 1823 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_1 = 1824 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_2 = 1825 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_3 = 1826 + XML_SCHEMAV_CVC_TYPE_3_1_1 = 1827 + XML_SCHEMAV_CVC_TYPE_3_1_2 = 1828 + XML_SCHEMAV_CVC_FACET_VALID = 1829 + XML_SCHEMAV_CVC_LENGTH_VALID = 1830 + XML_SCHEMAV_CVC_MINLENGTH_VALID = 1831 + XML_SCHEMAV_CVC_MAXLENGTH_VALID = 1832 + XML_SCHEMAV_CVC_MININCLUSIVE_VALID = 1833 + XML_SCHEMAV_CVC_MAXINCLUSIVE_VALID = 1834 + XML_SCHEMAV_CVC_MINEXCLUSIVE_VALID = 1835 + XML_SCHEMAV_CVC_MAXEXCLUSIVE_VALID = 1836 + XML_SCHEMAV_CVC_TOTALDIGITS_VALID = 1837 + XML_SCHEMAV_CVC_FRACTIONDIGITS_VALID = 1838 + XML_SCHEMAV_CVC_PATTERN_VALID = 1839 + XML_SCHEMAV_CVC_ENUMERATION_VALID = 1840 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_1 = 1841 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_2 = 1842 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_3 = 1843 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_4 = 1844 + XML_SCHEMAV_CVC_ELT_1 = 1845 + XML_SCHEMAV_CVC_ELT_2 = 1846 + XML_SCHEMAV_CVC_ELT_3_1 = 1847 + XML_SCHEMAV_CVC_ELT_3_2_1 = 1848 + XML_SCHEMAV_CVC_ELT_3_2_2 = 1849 + XML_SCHEMAV_CVC_ELT_4_1 = 1850 + XML_SCHEMAV_CVC_ELT_4_2 = 1851 + XML_SCHEMAV_CVC_ELT_4_3 = 1852 + XML_SCHEMAV_CVC_ELT_5_1_1 = 1853 + XML_SCHEMAV_CVC_ELT_5_1_2 = 1854 + XML_SCHEMAV_CVC_ELT_5_2_1 = 1855 + XML_SCHEMAV_CVC_ELT_5_2_2_1 = 1856 + XML_SCHEMAV_CVC_ELT_5_2_2_2_1 = 1857 + XML_SCHEMAV_CVC_ELT_5_2_2_2_2 = 1858 + XML_SCHEMAV_CVC_ELT_6 = 1859 + XML_SCHEMAV_CVC_ELT_7 = 1860 + XML_SCHEMAV_CVC_ATTRIBUTE_1 = 1861 + XML_SCHEMAV_CVC_ATTRIBUTE_2 = 1862 + XML_SCHEMAV_CVC_ATTRIBUTE_3 = 1863 + XML_SCHEMAV_CVC_ATTRIBUTE_4 = 1864 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_1 = 1865 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_1 = 1866 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_2 = 1867 + XML_SCHEMAV_CVC_COMPLEX_TYPE_4 = 1868 + XML_SCHEMAV_CVC_COMPLEX_TYPE_5_1 = 1869 + XML_SCHEMAV_CVC_COMPLEX_TYPE_5_2 = 1870 + XML_SCHEMAV_ELEMENT_CONTENT = 1871 + XML_SCHEMAV_DOCUMENT_ELEMENT_MISSING = 1872 + XML_SCHEMAV_CVC_COMPLEX_TYPE_1 = 1873 + XML_SCHEMAV_CVC_AU = 1874 + XML_SCHEMAV_CVC_TYPE_1 = 1875 + XML_SCHEMAV_CVC_TYPE_2 = 1876 + XML_SCHEMAV_CVC_IDC = 1877 + XML_SCHEMAV_CVC_WILDCARD = 1878 + XML_SCHEMAV_MISC = 1879 + XML_XPTR_UNKNOWN_SCHEME = 1900 + XML_XPTR_CHILDSEQ_START = 1901 + XML_XPTR_EVAL_FAILED = 1902 + XML_XPTR_EXTRA_OBJECTS = 1903 + XML_C14N_CREATE_CTXT = 1950 + XML_C14N_REQUIRES_UTF8 = 1951 + XML_C14N_CREATE_STACK = 1952 + XML_C14N_INVALID_NODE = 1953 + XML_C14N_UNKNOW_NODE = 1954 + XML_C14N_RELATIVE_NAMESPACE = 1955 + XML_FTP_PASV_ANSWER = 2000 + XML_FTP_EPSV_ANSWER = 2001 + XML_FTP_ACCNT = 2002 + XML_FTP_URL_SYNTAX = 2003 + XML_HTTP_URL_SYNTAX = 2020 + XML_HTTP_USE_IP = 2021 + XML_HTTP_UNKNOWN_HOST = 2022 + XML_SCHEMAP_SRC_SIMPLE_TYPE_1 = 3000 + XML_SCHEMAP_SRC_SIMPLE_TYPE_2 = 3001 + XML_SCHEMAP_SRC_SIMPLE_TYPE_3 = 3002 + XML_SCHEMAP_SRC_SIMPLE_TYPE_4 = 3003 + XML_SCHEMAP_SRC_RESOLVE = 3004 + XML_SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE = 3005 + XML_SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE = 3006 + XML_SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES = 3007 + XML_SCHEMAP_ST_PROPS_CORRECT_1 = 3008 + XML_SCHEMAP_ST_PROPS_CORRECT_2 = 3009 + XML_SCHEMAP_ST_PROPS_CORRECT_3 = 3010 + XML_SCHEMAP_COS_ST_RESTRICTS_1_1 = 3011 + XML_SCHEMAP_COS_ST_RESTRICTS_1_2 = 3012 + XML_SCHEMAP_COS_ST_RESTRICTS_1_3_1 = 3013 + XML_SCHEMAP_COS_ST_RESTRICTS_1_3_2 = 3014 + XML_SCHEMAP_COS_ST_RESTRICTS_2_1 = 3015 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_1 = 3016 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_2 = 3017 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_1 = 3018 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_2 = 3019 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_3 = 3020 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_4 = 3021 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_5 = 3022 + XML_SCHEMAP_COS_ST_RESTRICTS_3_1 = 3023 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1 = 3024 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1_2 = 3025 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_2 = 3026 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_1 = 3027 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_3 = 3028 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_4 = 3029 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_5 = 3030 + XML_SCHEMAP_COS_ST_DERIVED_OK_2_1 = 3031 + XML_SCHEMAP_COS_ST_DERIVED_OK_2_2 = 3032 + XML_SCHEMAP_S4S_ELEM_NOT_ALLOWED = 3033 + XML_SCHEMAP_S4S_ELEM_MISSING = 3034 + XML_SCHEMAP_S4S_ATTR_NOT_ALLOWED = 3035 + XML_SCHEMAP_S4S_ATTR_MISSING = 3036 + XML_SCHEMAP_S4S_ATTR_INVALID_VALUE = 3037 + XML_SCHEMAP_SRC_ELEMENT_1 = 3038 + XML_SCHEMAP_SRC_ELEMENT_2_1 = 3039 + XML_SCHEMAP_SRC_ELEMENT_2_2 = 3040 + XML_SCHEMAP_SRC_ELEMENT_3 = 3041 + XML_SCHEMAP_P_PROPS_CORRECT_1 = 3042 + XML_SCHEMAP_P_PROPS_CORRECT_2_1 = 3043 + XML_SCHEMAP_P_PROPS_CORRECT_2_2 = 3044 + XML_SCHEMAP_E_PROPS_CORRECT_2 = 3045 + XML_SCHEMAP_E_PROPS_CORRECT_3 = 3046 + XML_SCHEMAP_E_PROPS_CORRECT_4 = 3047 + XML_SCHEMAP_E_PROPS_CORRECT_5 = 3048 + XML_SCHEMAP_E_PROPS_CORRECT_6 = 3049 + XML_SCHEMAP_SRC_INCLUDE = 3050 + XML_SCHEMAP_SRC_ATTRIBUTE_1 = 3051 + XML_SCHEMAP_SRC_ATTRIBUTE_2 = 3052 + XML_SCHEMAP_SRC_ATTRIBUTE_3_1 = 3053 + XML_SCHEMAP_SRC_ATTRIBUTE_3_2 = 3054 + XML_SCHEMAP_SRC_ATTRIBUTE_4 = 3055 + XML_SCHEMAP_NO_XMLNS = 3056 + XML_SCHEMAP_NO_XSI = 3057 + XML_SCHEMAP_COS_VALID_DEFAULT_1 = 3058 + XML_SCHEMAP_COS_VALID_DEFAULT_2_1 = 3059 + XML_SCHEMAP_COS_VALID_DEFAULT_2_2_1 = 3060 + XML_SCHEMAP_COS_VALID_DEFAULT_2_2_2 = 3061 + XML_SCHEMAP_CVC_SIMPLE_TYPE = 3062 + XML_SCHEMAP_COS_CT_EXTENDS_1_1 = 3063 + XML_SCHEMAP_SRC_IMPORT_1_1 = 3064 + XML_SCHEMAP_SRC_IMPORT_1_2 = 3065 + XML_SCHEMAP_SRC_IMPORT_2 = 3066 + XML_SCHEMAP_SRC_IMPORT_2_1 = 3067 + XML_SCHEMAP_SRC_IMPORT_2_2 = 3068 + XML_SCHEMAP_INTERNAL = 3069 + XML_SCHEMAP_NOT_DETERMINISTIC = 3070 + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_1 = 3071 + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_2 = 3072 + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_3 = 3073 + XML_SCHEMAP_MG_PROPS_CORRECT_1 = 3074 + XML_SCHEMAP_MG_PROPS_CORRECT_2 = 3075 + XML_SCHEMAP_SRC_CT_1 = 3076 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077 + XML_SCHEMAP_AU_PROPS_CORRECT_2 = 3078 + XML_SCHEMAP_A_PROPS_CORRECT_2 = 3079 + XML_SCHEMAP_C_PROPS_CORRECT = 3080 + XML_SCHEMAP_SRC_REDEFINE = 3081 + XML_SCHEMAP_SRC_IMPORT = 3082 + XML_SCHEMAP_WARN_SKIP_SCHEMA = 3083 + XML_SCHEMAP_WARN_UNLOCATED_SCHEMA = 3084 + XML_SCHEMAP_WARN_ATTR_REDECL_PROH = 3085 + XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH = 3086 + XML_SCHEMAP_AG_PROPS_CORRECT = 3087 + XML_SCHEMAP_COS_CT_EXTENDS_1_2 = 3088 + XML_SCHEMAP_AU_PROPS_CORRECT = 3089 + XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 + XML_SCHEMAP_COS_ALL_LIMITED = 3091 + XML_SCHEMATRONV_ASSERT = 4000 + XML_SCHEMATRONV_REPORT = 4001 + XML_MODULE_OPEN = 4900 + XML_MODULE_CLOSE = 4901 + XML_CHECK_FOUND_ELEMENT = 5000 + XML_CHECK_FOUND_ATTRIBUTE = 5001 + XML_CHECK_FOUND_TEXT = 5002 + XML_CHECK_FOUND_CDATA = 5003 + XML_CHECK_FOUND_ENTITYREF = 5004 + XML_CHECK_FOUND_ENTITY = 5005 + XML_CHECK_FOUND_PI = 5006 + XML_CHECK_FOUND_COMMENT = 5007 + XML_CHECK_FOUND_DOCTYPE = 5008 + XML_CHECK_FOUND_FRAGMENT = 5009 + XML_CHECK_FOUND_NOTATION = 5010 + XML_CHECK_UNKNOWN_NODE = 5011 + XML_CHECK_ENTITY_TYPE = 5012 + XML_CHECK_NO_PARENT = 5013 + XML_CHECK_NO_DOC = 5014 + XML_CHECK_NO_NAME = 5015 + XML_CHECK_NO_ELEM = 5016 + XML_CHECK_WRONG_DOC = 5017 + XML_CHECK_NO_PREV = 5018 + XML_CHECK_WRONG_PREV = 5019 + XML_CHECK_NO_NEXT = 5020 + XML_CHECK_WRONG_NEXT = 5021 + XML_CHECK_NOT_DTD = 5022 + XML_CHECK_NOT_ATTR = 5023 + XML_CHECK_NOT_ATTR_DECL = 5024 + XML_CHECK_NOT_ELEM_DECL = 5025 + XML_CHECK_NOT_ENTITY_DECL = 5026 + XML_CHECK_NOT_NS_DECL = 5027 + XML_CHECK_NO_HREF = 5028 + XML_CHECK_WRONG_PARENT = 5029 + XML_CHECK_NS_SCOPE = 5030 + XML_CHECK_NS_ANCESTOR = 5031 + XML_CHECK_NOT_UTF8 = 5032 + XML_CHECK_NO_DICT = 5033 + XML_CHECK_NOT_NCNAME = 5034 + XML_CHECK_OUTSIDE_DICT = 5035 + XML_CHECK_WRONG_NAME = 5036 + XML_CHECK_NAME_NOT_NULL = 5037 + XML_I18N_NO_NAME = 6000 + XML_I18N_NO_HANDLER = 6001 + XML_I18N_EXCESS_HANDLER = 6002 + XML_I18N_CONV_FAILED = 6003 + XML_I18N_NO_OUTPUT = 6004 + XML_BUF_OVERFLOW = 7000 + + ctypedef enum xmlRelaxNGValidErr: + XML_RELAXNG_OK = 0 + XML_RELAXNG_ERR_MEMORY = 1 + XML_RELAXNG_ERR_TYPE = 2 + XML_RELAXNG_ERR_TYPEVAL = 3 + XML_RELAXNG_ERR_DUPID = 4 + XML_RELAXNG_ERR_TYPECMP = 5 + XML_RELAXNG_ERR_NOSTATE = 6 + XML_RELAXNG_ERR_NODEFINE = 7 + XML_RELAXNG_ERR_LISTEXTRA = 8 + XML_RELAXNG_ERR_LISTEMPTY = 9 + XML_RELAXNG_ERR_INTERNODATA = 10 + XML_RELAXNG_ERR_INTERSEQ = 11 + XML_RELAXNG_ERR_INTEREXTRA = 12 + XML_RELAXNG_ERR_ELEMNAME = 13 + XML_RELAXNG_ERR_ATTRNAME = 14 + XML_RELAXNG_ERR_ELEMNONS = 15 + XML_RELAXNG_ERR_ATTRNONS = 16 + XML_RELAXNG_ERR_ELEMWRONGNS = 17 + XML_RELAXNG_ERR_ATTRWRONGNS = 18 + XML_RELAXNG_ERR_ELEMEXTRANS = 19 + XML_RELAXNG_ERR_ATTREXTRANS = 20 + XML_RELAXNG_ERR_ELEMNOTEMPTY = 21 + XML_RELAXNG_ERR_NOELEM = 22 + XML_RELAXNG_ERR_NOTELEM = 23 + XML_RELAXNG_ERR_ATTRVALID = 24 + XML_RELAXNG_ERR_CONTENTVALID = 25 + XML_RELAXNG_ERR_EXTRACONTENT = 26 + XML_RELAXNG_ERR_INVALIDATTR = 27 + XML_RELAXNG_ERR_DATAELEM = 28 + XML_RELAXNG_ERR_VALELEM = 29 + XML_RELAXNG_ERR_LISTELEM = 30 + XML_RELAXNG_ERR_DATATYPE = 31 + XML_RELAXNG_ERR_VALUE = 32 + XML_RELAXNG_ERR_LIST = 33 + XML_RELAXNG_ERR_NOGRAMMAR = 34 + XML_RELAXNG_ERR_EXTRADATA = 35 + XML_RELAXNG_ERR_LACKDATA = 36 + XML_RELAXNG_ERR_INTERNAL = 37 + XML_RELAXNG_ERR_ELEMWRONG = 38 + XML_RELAXNG_ERR_TEXTWRONG = 39 +# --- END: GENERATED CONSTANTS --- + +cdef extern from "libxml/xmlerror.h" nogil: + ctypedef struct xmlError: + int domain + int code + char* message + xmlErrorLevel level + char* file + char* str1 + char* str2 + char* str3 + int line + int int1 + int int2 + void* node + + ctypedef void (*xmlGenericErrorFunc)(void* ctxt, char* msg, ...) noexcept + ctypedef void (*xmlStructuredErrorFunc)(void* userData, + const xmlError* error) noexcept + + cdef void xmlSetGenericErrorFunc( + void* ctxt, xmlGenericErrorFunc func) + cdef void xmlSetStructuredErrorFunc( + void* ctxt, xmlStructuredErrorFunc func) + +cdef extern from "libxml/globals.h" nogil: + cdef xmlStructuredErrorFunc xmlStructuredError + cdef void* xmlStructuredErrorContext diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd new file mode 100644 index 0000000000000000000000000000000000000000..e0ef221af7e14f7fa2bb4c7ed8710a5b0524c5d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd @@ -0,0 +1,318 @@ +from libc.string cimport const_char + +from lxml.includes.tree cimport ( + xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar) +from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback +from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel + + +cdef extern from "libxml/parser.h" nogil: + ctypedef void (*startElementNsSAX2Func)(void* ctx, + const_xmlChar* localname, + const_xmlChar* prefix, + const_xmlChar* URI, + int nb_namespaces, + const_xmlChar** namespaces, + int nb_attributes, + int nb_defaulted, + const_xmlChar** attributes) noexcept + + ctypedef void (*endElementNsSAX2Func)(void* ctx, + const_xmlChar* localname, + const_xmlChar* prefix, + const_xmlChar* URI) noexcept + + ctypedef void (*startElementSAXFunc)(void* ctx, const_xmlChar* name, const_xmlChar** atts) noexcept + + ctypedef void (*endElementSAXFunc)(void* ctx, const_xmlChar* name) noexcept + + ctypedef void (*charactersSAXFunc)(void* ctx, const_xmlChar* ch, int len) noexcept + + ctypedef void (*cdataBlockSAXFunc)(void* ctx, const_xmlChar* value, int len) noexcept + + ctypedef void (*commentSAXFunc)(void* ctx, const_xmlChar* value) noexcept + + ctypedef void (*processingInstructionSAXFunc)(void* ctx, + const_xmlChar* target, + const_xmlChar* data) noexcept + + ctypedef void (*internalSubsetSAXFunc)(void* ctx, + const_xmlChar* name, + const_xmlChar* externalID, + const_xmlChar* systemID) noexcept + + ctypedef void (*endDocumentSAXFunc)(void* ctx) noexcept + + ctypedef void (*startDocumentSAXFunc)(void* ctx) noexcept + + ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name) noexcept + + ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name) noexcept + + cdef int XML_SAX2_MAGIC + +cdef extern from "libxml/tree.h" nogil: + ctypedef struct xmlParserInput: + int line + int col + int length + const_xmlChar* base + const_xmlChar* cur + const_xmlChar* end + const_char *filename + + ctypedef struct xmlParserInputBuffer: + void* context + xmlInputReadCallback readcallback + xmlInputCloseCallback closecallback + + ctypedef struct xmlSAXHandlerV1: + # same as xmlSAXHandler, but without namespaces + pass + + ctypedef struct xmlSAXHandler: + internalSubsetSAXFunc internalSubset + startElementNsSAX2Func startElementNs + endElementNsSAX2Func endElementNs + startElementSAXFunc startElement + endElementSAXFunc endElement + charactersSAXFunc characters + cdataBlockSAXFunc cdataBlock + referenceSAXFunc reference + getEntitySAXFunc getEntity + commentSAXFunc comment + processingInstructionSAXFunc processingInstruction + startDocumentSAXFunc startDocument + endDocumentSAXFunc endDocument + int initialized + xmlStructuredErrorFunc serror + void* _private + + +cdef extern from "libxml/SAX2.h" nogil: + cdef void xmlSAX2StartDocument(void* ctxt) + + +cdef extern from "libxml/xmlIO.h" nogil: + cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) + + +cdef extern from "libxml/parser.h" nogil: + + ctypedef enum xmlFeature: + XML_WITH_THREAD = 1 + XML_WITH_TREE = 2 + XML_WITH_OUTPUT = 3 + XML_WITH_PUSH = 4 + XML_WITH_READER = 5 + XML_WITH_PATTERN = 6 + XML_WITH_WRITER = 7 + XML_WITH_SAX1 = 8 + XML_WITH_FTP = 9 + XML_WITH_HTTP = 10 + XML_WITH_VALID = 11 + XML_WITH_HTML = 12 + XML_WITH_LEGACY = 13 + XML_WITH_C14N = 14 + XML_WITH_CATALOG = 15 + XML_WITH_XPATH = 16 + XML_WITH_XPTR = 17 + XML_WITH_XINCLUDE = 18 + XML_WITH_ICONV = 19 + XML_WITH_ISO8859X = 20 + XML_WITH_UNICODE = 21 + XML_WITH_REGEXP = 22 + XML_WITH_AUTOMATA = 23 + XML_WITH_EXPR = 24 + XML_WITH_SCHEMAS = 25 + XML_WITH_SCHEMATRON = 26 + XML_WITH_MODULES = 27 + XML_WITH_DEBUG = 28 + XML_WITH_DEBUG_MEM = 29 + XML_WITH_DEBUG_RUN = 30 + XML_WITH_ZLIB = 31 + XML_WITH_ICU = 32 + XML_WITH_LZMA = 33 + + cdef bint xmlHasFeature(xmlFeature feature) + + cdef xmlDict* xmlDictCreate() + cdef xmlDict* xmlDictCreateSub(xmlDict* subdict) + cdef void xmlDictFree(xmlDict* sub) + cdef int xmlDictReference(xmlDict* dict) + + cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes + cdef int XML_SKIP_IDS # SAX option for not building an XML ID dict + + ctypedef enum xmlParserInputState: + XML_PARSER_EOF = -1 # nothing is to be parsed + XML_PARSER_START = 0 # nothing has been parsed + XML_PARSER_MISC = 1 # Misc* before int subset + XML_PARSER_PI = 2 # Within a processing instruction + XML_PARSER_DTD = 3 # within some DTD content + XML_PARSER_PROLOG = 4 # Misc* after internal subset + XML_PARSER_COMMENT = 5 # within a comment + XML_PARSER_START_TAG = 6 # within a start tag + XML_PARSER_CONTENT = 7 # within the content + XML_PARSER_CDATA_SECTION = 8 # within a CDATA section + XML_PARSER_END_TAG = 9 # within a closing tag + XML_PARSER_ENTITY_DECL = 10 # within an entity declaration + XML_PARSER_ENTITY_VALUE = 11 # within an entity value in a decl + XML_PARSER_ATTRIBUTE_VALUE = 12 # within an attribute value + XML_PARSER_SYSTEM_LITERAL = 13 # within a SYSTEM value + XML_PARSER_EPILOG = 14 # the Misc* after the last end tag + XML_PARSER_IGNORE = 15 # within an IGNORED section + XML_PARSER_PUBLIC_LITERAL = 16 # within a PUBLIC value + + + ctypedef struct xmlParserCtxt: + xmlDoc* myDoc + xmlDict* dict + int dictNames + void* _private + bint wellFormed + bint recovery + int options + bint disableSAX + int errNo + xmlParserInputState instate + bint replaceEntities + int loadsubset # != 0 if enabled, int value == why + bint validate + xmlError lastError + xmlNode* node + xmlSAXHandler* sax + void* userData + int* spaceTab + int spaceMax + int nsNr + bint html + bint progressive + int inSubset + int charset + xmlParserInput* input + int inputNr + xmlParserInput* inputTab[] + + ctypedef enum xmlParserOption: + XML_PARSE_RECOVER = 0x1 # recover on errors + XML_PARSE_NOENT = 0x2 # substitute entities + XML_PARSE_DTDLOAD = 0x4 # load the external subset + XML_PARSE_DTDATTR = 0x8 # default DTD attributes + XML_PARSE_DTDVALID = 0x10 # validate with the DTD + XML_PARSE_NOERROR = 0x20 # suppress error reports + XML_PARSE_NOWARNING = 0x40 # suppress warning reports + XML_PARSE_PEDANTIC = 0x80 # pedantic error reporting + XML_PARSE_NOBLANKS = 0x100 # remove blank nodes + XML_PARSE_SAX1 = 0x200 # use the SAX1 interface internally + XML_PARSE_XINCLUDE = 0x400 # Implement XInclude substitution + XML_PARSE_NONET = 0x800 # Forbid network access + XML_PARSE_NODICT = 0x1000 # Do not reuse the context dictionary + XML_PARSE_NSCLEAN = 0x2000 # remove redundant namespaces declarations + XML_PARSE_NOCDATA = 0x4000 # merge CDATA as text nodes + XML_PARSE_NOXINCNODE = 0x8000 # do not generate XINCLUDE START/END nodes + # libxml2 2.6.21+ only: + XML_PARSE_COMPACT = 0x1_0000 # compact small text nodes + # libxml2 2.7.0+ only: + XML_PARSE_OLD10 = 0x2_0000 # parse using XML-1.0 before update 5 + XML_PARSE_NOBASEFIX = 0x4_0000 # do not fixup XINCLUDE xml:base uris + XML_PARSE_HUGE = 0x8_0000 # relax any hardcoded limit from the parser + # libxml2 2.7.3+ only: + XML_PARSE_OLDSAX = 0x10_0000 # parse using SAX2 interface before 2.7.0 + # libxml2 2.8.0+ only: + XML_PARSE_IGNORE_ENC = 0x20_0000 # ignore internal document encoding hint + # libxml2 2.9.0+ only: + XML_PARSE_BIG_LINES = 0x40_0000 # Store big lines numbers in text PSVI field + # libxml2 2.13.0+ only: + XML_PARSE_NO_XXE = 0x80_0000 # Disable loading of external DTDs or entities + # libxml2 2.14.0+ only: + XML_PARSE_UNZIP = 0x100_0000 # Enable input decompression (and potential gzip bombs) + XML_PARSE_NO_SYS_CATALOG = 0x200_0000 # Disable the global system XML catalog + XML_PARSE_CATALOG_PI = 0x400_0000 # Enable XML catalog processing instructions + # libxml2 2.15.0+ only: + XML_PARSE_SKIP_IDS = 0x800_0000 # Force the parser to ignore IDs + + cdef void xmlInitParser() + cdef void xmlCleanupParser() + + cdef int xmlLineNumbersDefault(int onoff) + cdef xmlParserCtxt* xmlNewParserCtxt() + cdef xmlParserInput* xmlNewIOInputStream(xmlParserCtxt* ctxt, + xmlParserInputBuffer* input, + int enc) + cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) + cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt) + cdef void xmlCtxtReset(xmlParserCtxt* ctxt) + cdef void xmlClearParserCtxt(xmlParserCtxt* ctxt) + cdef int xmlParseChunk(xmlParserCtxt* ctxt, + char* chunk, int size, int terminate) + cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt, + char* cur, char* URL, char* encoding, + int options) + cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt, + char* filename, char* encoding, + int options) + cdef xmlDoc* xmlCtxtReadIO(xmlParserCtxt* ctxt, + xmlInputReadCallback ioread, + xmlInputCloseCallback ioclose, + void* ioctx, + char* URL, char* encoding, + int options) + cdef xmlDoc* xmlCtxtReadMemory(xmlParserCtxt* ctxt, + char* buffer, int size, + char* filename, const_char* encoding, + int options) + + cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node, + int domain, int code, xmlErrorLevel level, + const xmlChar *str1, const xmlChar *str2, const xmlChar *str3, + int int1, const char *msg, ...) + + +# iterparse: + + cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax, + void* user_data, + char* chunk, + int size, + char* filename) + + cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt, + char* chunk, + int size, + char* filename, + char* encoding) + +# entity loaders: + + ctypedef xmlParserInput* (*xmlExternalEntityLoader)( + const_char * URL, const_char * ID, xmlParserCtxt* context) noexcept + cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() + cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) + + cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) noexcept + +# DTDs: + + cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) + cdef xmlDtd* xmlIOParseDTD(xmlSAXHandler* sax, + xmlParserInputBuffer* input, + int enc) + + +cdef extern from "libxml/parserInternals.h" nogil: + """ + #if LIBXML_VERSION < 21400 + #define xmlNewInputFromMemory(url, mem, size, flags) (NULL) + #endif + """ + cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt) + cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt, + char* buffer) + cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt, + char* filename) + cdef xmlParserInput* xmlNewInputFromMemory( + const char *url, const void *mem, size_t size, int flags) # actually "xmlParserInputFlags flags" + cdef void xmlFreeInputStream(xmlParserInput* input) + cdef int xmlSwitchEncoding(xmlParserCtxt* ctxt, int enc) + cdef bint xmlCtxtIsStopped(xmlParserCtxt* ctxt) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0674111132ee3c2da8f2a3ec1db7d9dbb462ad30 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd @@ -0,0 +1,35 @@ +from lxml.includes.tree cimport xmlDoc +from lxml.includes.xmlparser cimport xmlSAXHandler +from lxml.includes.xmlerror cimport xmlStructuredErrorFunc + +cdef extern from "libxml/xmlschemas.h" nogil: + ctypedef struct xmlSchema + ctypedef struct xmlSchemaParserCtxt + + ctypedef struct xmlSchemaSAXPlugStruct + ctypedef struct xmlSchemaValidCtxt + + ctypedef enum xmlSchemaValidOption: + XML_SCHEMA_VAL_VC_I_CREATE = 1 + + cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil + cdef void xmlSchemaSetParserStructuredErrors(xmlSchemaParserCtxt* ctxt, + xmlStructuredErrorFunc serror, void *ctx) + cdef void xmlSchemaSetValidStructuredErrors(xmlSchemaValidCtxt* ctxt, + xmlStructuredErrorFunc serror, void *ctx) + + cdef int xmlSchemaValidateDoc(xmlSchemaValidCtxt* ctxt, xmlDoc* doc) nogil + cdef xmlSchema* xmlSchemaParse(xmlSchemaParserCtxt* ctxt) nogil + cdef xmlSchemaParserCtxt* xmlSchemaNewParserCtxt(char* URL) nogil + cdef xmlSchemaParserCtxt* xmlSchemaNewDocParserCtxt(xmlDoc* doc) nogil + cdef void xmlSchemaFree(xmlSchema* schema) nogil + cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil + cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil + cdef int xmlSchemaSetValidOptions(xmlSchemaValidCtxt* ctxt, + int options) nogil + + cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt, + xmlSAXHandler** sax, + void** data) nogil + cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug) + cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd new file mode 100644 index 0000000000000000000000000000000000000000..abafe4325c9147e77590ff0f1647100bb5e9c56a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd @@ -0,0 +1,190 @@ +from lxml.includes.tree cimport xmlDoc, xmlNode, xmlDict, xmlChar, const_xmlChar, xmlOutputBuffer +from lxml.includes.xmlerror cimport xmlGenericErrorFunc +from lxml.includes.xpath cimport xmlXPathContext, xmlXPathFunction + +from libc.string cimport const_char + +cdef extern from "libxslt/xslt.h": + cdef int xsltLibxsltVersion + cdef int xsltMaxDepth + +cdef extern from "libxslt/xsltconfig.h": + cdef int LIBXSLT_VERSION + +cdef extern from "libxslt/xsltInternals.h" nogil: + ctypedef enum xsltTransformState: + XSLT_STATE_OK # 0 + XSLT_STATE_ERROR # 1 + XSLT_STATE_STOPPED # 2 + + ctypedef struct xsltDocument: + xmlDoc* doc + + ctypedef struct xsltStylesheet: + xmlChar* encoding + xmlDoc* doc + int errors + + ctypedef struct xsltTransformContext: + xsltStylesheet* style + xmlXPathContext* xpathCtxt + xsltDocument* document + void* _private + xmlDict* dict + int profile + xmlNode* node + xmlDoc* output + xmlNode* insert + xmlNode* inst + xsltTransformState state + + ctypedef struct xsltStackElem + + ctypedef struct xsltTemplate + + cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) + cdef void xsltFreeStylesheet(xsltStylesheet* sheet) + +cdef extern from "libxslt/imports.h" nogil: + # actually defined in "etree_defs.h" + cdef void LXML_GET_XSLT_ENCODING(const_xmlChar* result_var, xsltStylesheet* style) + +cdef extern from "libxslt/extensions.h" nogil: + ctypedef void (*xsltTransformFunction)(xsltTransformContext* ctxt, + xmlNode* context_node, + xmlNode* inst, + void* precomp_unused) noexcept + + cdef int xsltRegisterExtFunction(xsltTransformContext* ctxt, + const_xmlChar* name, + const_xmlChar* URI, + xmlXPathFunction function) + cdef int xsltRegisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI, + xmlXPathFunction function) + cdef int xsltUnregisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI) + cdef xmlXPathFunction xsltExtModuleFunctionLookup( + const_xmlChar* name, const_xmlChar* URI) + cdef int xsltRegisterExtPrefix(xsltStylesheet* style, + const_xmlChar* prefix, const_xmlChar* URI) + cdef int xsltRegisterExtElement(xsltTransformContext* ctxt, + const_xmlChar* name, const_xmlChar* URI, + xsltTransformFunction function) + +cdef extern from "libxslt/documents.h" nogil: + ctypedef enum xsltLoadType: + XSLT_LOAD_START + XSLT_LOAD_STYLESHEET + XSLT_LOAD_DOCUMENT + + ctypedef xmlDoc* (*xsltDocLoaderFunc)(const_xmlChar* URI, xmlDict* dict, + int options, + void* ctxt, + xsltLoadType type) noexcept + cdef xsltDocLoaderFunc xsltDocDefaultLoader + cdef void xsltSetLoaderFunc(xsltDocLoaderFunc f) + +cdef extern from "libxslt/transform.h" nogil: + cdef xmlDoc* xsltApplyStylesheet(xsltStylesheet* style, xmlDoc* doc, + const_char** params) + cdef xmlDoc* xsltApplyStylesheetUser(xsltStylesheet* style, xmlDoc* doc, + const_char** params, const_char* output, + void* profile, + xsltTransformContext* context) + cdef void xsltProcessOneNode(xsltTransformContext* ctxt, + xmlNode* contextNode, + xsltStackElem* params) + cdef xsltTransformContext* xsltNewTransformContext(xsltStylesheet* style, + xmlDoc* doc) + cdef void xsltFreeTransformContext(xsltTransformContext* context) + cdef void xsltApplyOneTemplate(xsltTransformContext* ctxt, + xmlNode* contextNode, xmlNode* list, + xsltTemplate* templ, + xsltStackElem* params) + + +cdef extern from "libxslt/xsltutils.h" nogil: + cdef int xsltSaveResultToString(xmlChar** doc_txt_ptr, + int* doc_txt_len, + xmlDoc* result, + xsltStylesheet* style) + cdef int xsltSaveResultToFilename(const_char *URL, + xmlDoc* result, + xsltStylesheet* style, + int compression) + cdef int xsltSaveResultTo(xmlOutputBuffer* buf, + xmlDoc* result, + xsltStylesheet* style) + cdef xmlGenericErrorFunc xsltGenericError + cdef void *xsltGenericErrorContext + cdef void xsltSetGenericErrorFunc( + void* ctxt, void (*handler)(void* ctxt, char* msg, ...) nogil) + cdef void xsltSetTransformErrorFunc( + xsltTransformContext*, void* ctxt, + void (*handler)(void* ctxt, char* msg, ...) nogil) + cdef void xsltTransformError(xsltTransformContext* ctxt, + xsltStylesheet* style, + xmlNode* node, char* msg, ...) + cdef void xsltSetCtxtParseOptions( + xsltTransformContext* ctxt, int options) + + +cdef extern from "libxslt/security.h" nogil: + ctypedef struct xsltSecurityPrefs + ctypedef enum xsltSecurityOption: + XSLT_SECPREF_READ_FILE = 1 + XSLT_SECPREF_WRITE_FILE = 2 + XSLT_SECPREF_CREATE_DIRECTORY = 3 + XSLT_SECPREF_READ_NETWORK = 4 + XSLT_SECPREF_WRITE_NETWORK = 5 + + ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) noexcept + + cdef xsltSecurityPrefs* xsltNewSecurityPrefs() + cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec) + cdef int xsltSecurityForbid(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSecurityAllow(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec, + xsltSecurityOption option, + xsltSecurityCheck func) + cdef xsltSecurityCheck xsltGetSecurityPrefs( + xsltSecurityPrefs* sec, + xsltSecurityOption option) + cdef int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt) + cdef xmlDoc* xsltGetProfileInformation(xsltTransformContext* ctxt) + +cdef extern from "libxslt/variables.h" nogil: + cdef int xsltQuoteUserParams(xsltTransformContext* ctxt, + const_char** params) + cdef int xsltQuoteOneUserParam(xsltTransformContext* ctxt, + const_xmlChar* name, + const_xmlChar* value) + +cdef extern from "libxslt/extra.h" nogil: + const_xmlChar* XSLT_LIBXSLT_NAMESPACE + const_xmlChar* XSLT_XALAN_NAMESPACE + const_xmlChar* XSLT_SAXON_NAMESPACE + const_xmlChar* XSLT_XT_NAMESPACE + + cdef xmlXPathFunction xsltFunctionNodeSet + cdef void xsltRegisterAllExtras() + +cdef extern from "libexslt/exslt.h" nogil: + cdef void exsltRegisterAll() + + # libexslt 1.1.25+ + const_xmlChar* EXSLT_DATE_NAMESPACE + const_xmlChar* EXSLT_SETS_NAMESPACE + const_xmlChar* EXSLT_MATH_NAMESPACE + const_xmlChar* EXSLT_STRINGS_NAMESPACE + + cdef int exsltDateXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix) + cdef int exsltSetsXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix) + cdef int exsltMathXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix) + cdef int exsltStrXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc9e2c299647192bf5ee650105ff46426b3e4076 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng new file mode 100644 index 0000000000000000000000000000000000000000..a4f504af1f7d6f01f7523d447b9304f417c01800 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng @@ -0,0 +1,709 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ltr + rtl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + preserve + default + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl new file mode 100644 index 0000000000000000000000000000000000000000..21a5d2a069cab9fa327d9a3cd4e4d56c21bb10db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl new file mode 100644 index 0000000000000000000000000000000000000000..de0c9ea700d20c78111660e5fe8bf4ddc5a88137 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl new file mode 100644 index 0000000000000000000000000000000000000000..5018395234799dd65d53a339daaddf445a09dbea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl @@ -0,0 +1,313 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Suppressed abstract pattern was here + + + + + + + Start pattern based on abstract + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl new file mode 100644 index 0000000000000000000000000000000000000000..44e5573b73077e015d404935479bdc9344c5ea4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl @@ -0,0 +1,1160 @@ + + + + + + + + + + true + true + true + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in RELAX NG extRef + include + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in Schematron include + + + + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + Schema error: Use include to + include fragments, not a whole + schema + + + + + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + Schema error: Use include to include + fragments, not a whole schema + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in Schematron include + + + + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in Schematron include + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + Schema error: Use include to include + fragments, not a whole schema + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + Schema error: Use include to include + fragments, not a whole schema + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in DTLL include + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in CRDL include + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Fatal error: Xinclude href contains fragment + identifier # + + + + + + + Fatal error: Sorry, this software only + supports simple ids in XInclude xpointers + + + + + + + Fatal Error: Impossible URL in XInclude + include + + + + + + + + + + + + + + + + + + + + + + + + + + + Unable to open referenced included file and fallback + file: + + + + + + + Unable to open referenced included file: + + + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Error: Impossible URL in XLink embedding + link + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + Unable to open referenced included file: + + + + + + + Unable to locate id attribute: + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl new file mode 100644 index 0000000000000000000000000000000000000000..d59b8f38fe0bf28bc089b033c2acdd314839b8cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + + + + ( + / + ) + + + \ No newline at end of file diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl new file mode 100644 index 0000000000000000000000000000000000000000..b0e7175cfff34fb05d631622c9429f0adcda8d5d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl @@ -0,0 +1,1796 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #ALL + + + +false + +true + + + + + true + false + + + + + + + true + false + + + + + + + + + @*| + + * + node() + *|comment()|processing-instruction() + + + + + + + + + + + + +default + +false + + + +1 + + + + + Schema error: Schematron elements in old and new namespaces found + + + + + + + + + + + + + + + + + Schema error: in the queryBinding attribute, use 'xslt' + + + + + 1.0 + + + + + + + + + This XSLT was automatically generated from a Schematron schema. + + + + + 1.0 + + + + + + + + + + Fail: This implementation of ISO Schematron does not work with + schemas using the "" query language. + + + + + Implementers: please note that overriding process-prolog or process-root is + the preferred method for meta-stylesheets to use where possible. + + + + + + + + + + PHASES + + PROLOG + + KEYS + + DEFAULT RULES + + SCHEMA METADATA + + SCHEMATRON PATTERNS + + + + + + + + + + + + + + + + + + + + + + + Phase Error: no phase with name has been defined. + + + + + + + MODE: SCHEMATRON-SELECT-FULL-PATH + This mode can be used to generate an ugly though full XPath for locators + + + + + + + + + + + + + + + + + + + + + + + + + MODE: SCHEMATRON-FULL-PATH + This mode can be used to generate an ugly though full XPath for locators + + + + + + / + + + + + + [] + + + + *[local-name()=' + ' and namespace-uri()=' + + '] + + + [] + + + + + + + + + + / + + @ + + @*[local-name()=' + + ' and namespace-uri()=' + + '] + + + + + + + + + MODE: SCHEMATRON-FULL-PATH-2 + + This mode can be used to generate prefixed XPath for humans + + + + + + / + + + [ + + ] + + + + + /@ + + + + + MODE: GENERATE-ID-FROM-PATH + + + + + + + + + + + + + + + + + + + + + + . + + + + + + + MODE: SCHEMATRON-FULL-PATH-3 + + + This mode can be used to generate prefixed XPath for humans + (Top-level element has index) + + + + + + / + + + [ + + ] + + + + + /@ + + + + + MODE: GENERATE-ID-2 + + + U + + + U + + + + + U. + + n + + + + + U. + + _ + + _ + + + + + Strip characters + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no pattern attribute in <active> + + + + Reference Error: the pattern "" has been activated but is not declared + + + + + + + + Markup Error: no test attribute in <assert + + + ASSERT + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no test attribute in <report> + + + + REPORT + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no id attribute in <diagnostic> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no rule attribute in <extends> + + + Reference Error: the abstract rule "" has been referenced but is not declared + + + + + + + + + + + + + + Markup Error: no name attribute in <key> + + + Markup Error: no path or use attribute in <key> + + + + + + + + + + + + + + + + Markup Error: no path or use attribute in <key> + + + + + + + + + + + + Schema error: The key element is not in the ISO Schematron namespace. Use the XSLT namespace. + + + + + + + + Schema error: Empty href= attribute for include directive. + + + + + + + + + + + + + + Error: Impossible URL in Schematron include + + + + + + + Schema error: Use include to include fragments, not a whole schema + + + + + + + + + + Schema error: Use include to include fragments, not a whole schema + + + + + + + + + + + + + + + Error: Impossible URL in Schematron include + + + + + + + Schema error: Use include to include fragments, not a whole schema + + + + + + + + + + + Schema error: Use include to include fragments, not a whole schema + + + + + + + + + + Warning: Variables should not be used with the "xpath" query language binding. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no uri attribute in <ns> + + + Markup Error: no prefix attribute in <ns> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //( + + ( + + ) + | + + ) + [not(self::text())] + + + + + + + + + + + + + Schema implementation error: This schema has abstract patterns, yet they are supposed to be preprocessed out already + + + + + + + + + + PATTERN + + + + + + + + + + + + + + + + + + + + Markup Error: no id attribute in <phase> + + + + + + + + Markup Error: no context attribute in <rule> + + + RULE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no id attribute on abstract <rule> + + + Markup Error: (2) context attribute on abstract <rule> + + + + + + Markup Error: context attribute on abstract <rule> + + + + + + + + + + + + + + + + + + + + + + + + + + + Markup Error: no select attribute in <value-of> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Warning: + + must not contain any child elements + + + + + + + + + + + + + + + + + + + + + + + + + Reference error: A diagnostic "" has been referenced but is not declared + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Using the XSLT namespace with a prefix other than "xsl" in + Schematron rules is not supported + in this processor: + + + + + + + + + + + + + + + + + + + + Error: unrecognized element in ISO Schematron namespace: check spelling + and capitalization + + + + + + + + + + + + + Warning: unrecognized element + + + + + + + + + + + + + + + Warning: unrecognized element + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + title + + + + + + + schema-title + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl new file mode 100644 index 0000000000000000000000000000000000000000..dae74ff6a2bd56a4f21147905c3a1661baf0b3ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl @@ -0,0 +1,588 @@ + + + + + + + + + + + + + + + + +true + + + + + + + + + + + #ALL + + +false +true +true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + xslt1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +   +   +   + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5d6dfcd9e9c2787c32d98b0063a1fa1ec3236ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt @@ -0,0 +1,84 @@ +ISO SCHEMATRON 2010 + +XSLT implementation by Rick Jelliffe with assistance from members of Schematron-love-in maillist. + +2010-04-21 + +Two distributions are available. One is for XSLT1 engines. +The other is for XSLT2 engines, such as SAXON 9. + + +This version of Schematron splits the process into a pipeline of several different XSLT stages. + +1) First, preprocess your Schematron schema with iso_dsdl_include.xsl. +This is a macro processor to assemble the schema from various parts. +If your schema is not in separate parts, you can skip this stage. +This stage also generates error messages for some common XPath syntax problems. + +2) Second, preprocess the output from stage 1 with iso_abstract_expand.xsl. +This is a macro processor to convert abstract patterns to real patterns. +If your schema does not use abstract patterns, you can skip this +stage. + +3) Third, compile the Schematron schema into an XSLT script. +This will typically use iso_svrl_for_xslt1.xsl or iso_svrl_for_xslt2.xsl +(which in turn invoke iso_schematron_skeleton_for_xslt1.xsl or iso_schematron_skeleton_for_saxon.xsl) +However, other "meta-stylesheets" are also in common use; the principle of operation is the same. +If your schema uses Schematron phases, supply these as command line/invocation parameters +to this process. + +4) Fourth, run the script generated by stage 3 against the document being validated. +If you are using the SVRL script, then the output of validation will be an XML document. +If your schema uses Schematron parameters, supply these as command line/invocation parameters +to this process. + + +The XSLT2 distribution also features several next generation features, +such as validating multiple documents. See the source code for details. + +Schematron assertions can be written in any language, of course; the file +sch-messages-en.xhtml contains the diagnostics messages from the XSLT2 skeleton +in English, and this can be used as template to localize the skeleton's +error messages. Note that typically programming errors in Schematron are XPath +errors, which requires localized messages from the XSLT engine. + +ANT +--- +To give an example of how to process a document, here is a sample ANT task. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2441f8fa39ee4b43a36644da847825b8b795c232 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b35162aa894b8ef93271eeed3337fe812ed25a2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..330872134db456f01cbda7e3c7539bf7bfa440e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ff620c8819707d2f7620274c1aed949d6bd4dbd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8362645998854a295cd55e5a0f8a86970ac38c40 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd0d17aa55bb4529820ce347f6275d38f6c0caa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py @@ -0,0 +1,98 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from ...utils import is_sklearn_available, requires_backends + + +if is_sklearn_available(): + from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import f1_score, matthews_corrcoef + + +DEPRECATION_WARNING = ( + "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate " + "library. You can have a look at this example script for pointers: " + "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py" +) + + +def simple_accuracy(preds, labels): + warnings.warn(DEPRECATION_WARNING, FutureWarning) + requires_backends(simple_accuracy, "sklearn") + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + warnings.warn(DEPRECATION_WARNING, FutureWarning) + requires_backends(acc_and_f1, "sklearn") + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + warnings.warn(DEPRECATION_WARNING, FutureWarning) + requires_backends(pearson_and_spearman, "sklearn") + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def glue_compute_metrics(task_name, preds, labels): + warnings.warn(DEPRECATION_WARNING, FutureWarning) + requires_backends(glue_compute_metrics, "sklearn") + assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}" + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"mnli/acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"mnli-mm/acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "hans": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + + +def xnli_compute_metrics(task_name, preds, labels): + warnings.warn(DEPRECATION_WARNING, FutureWarning) + requires_backends(xnli_compute_metrics, "sklearn") + if len(preds) != len(labels): + raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}") + if task_name == "xnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..392ea7f745d95e81d4afcc21161c531199b3b4cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6628a5c73a789682cff3f8f5bf4a617cf8b617e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..0ffc025b65a0451523004df12f5a4ae5e9d17b9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py @@ -0,0 +1,779 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to +update `find_best_threshold` scripts for SQuAD V2.0 + +In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an +additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted +probability that a question is unanswerable. +""" + +import collections +import json +import math +import re +import string + +from ...models.bert import BasicTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(examples, preds): + """ + Computes the exact and f1 scores from the examples and the model predictions + """ + exact_scores = {} + f1_scores = {} + + for example in examples: + qas_id = example.qas_id + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] + + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [""] + + if qas_id not in preds: + print(f"Missing prediction for {qas_id}") + continue + + prediction = preds[qas_id] + exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) + else: + total = len(qid_list) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval[f"{prefix}_{k}"] = new_eval[k] + + +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for qid in qid_list: + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: + continue + has_ans_cnt += 1 + + if qid not in scores: + continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + + +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for _, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + + +def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] + + if no_answer_probs is None: + no_answer_probs = dict.fromkeys(preds, 0.0) + + exact, f1 = get_raw_scores(examples, preds) + + exact_threshold = apply_no_ans_threshold( + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold + ) + f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + + evaluation = make_eval_dict(exact_threshold, f1_threshold) + + if has_answer_qids: + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) + merge_eval(evaluation, has_ans_eval, "HasAns") + + if no_answer_qids: + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) + merge_eval(evaluation, no_ans_eval, "NoAns") + + if no_answer_probs: + find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) + + return evaluation + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for i, c in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'") + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'") + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for i, tok_index in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def compute_predictions_logits( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, + tokenizer, +): + """Write final predictions to the json file and log-odds of null if needed.""" + if output_prediction_file: + logger.info(f"Writing predictions to: {output_prediction_file}") + if output_nbest_file: + logger.info(f"Writing nbest to: {output_nbest_file}") + if output_null_log_odds_file and version_2_with_negative: + logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for example_index, example in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for feature_index, feature in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index], + ) + ) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # tok_text = " ".join(tok_tokens) + # + # # De-tokenize WordPieces that have been split off. + # tok_text = tok_text.replace(" ##", "") + # tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + if len(nbest) < 1: + raise ValueError("No valid predictions") + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for i, entry in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + if len(nbest_json) < 1: + raise ValueError("No valid predictions") + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + if output_prediction_file: + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + if output_nbest_file: + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if output_null_log_odds_file and version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): + """ + XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of + null if needed. + + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) + + logger.info(f"Writing predictions to: {output_prediction_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for example_index, example in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for feature_index, feature in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_logits[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_logits[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob, + ) + ) + + prelim_predictions = sorted( + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. + # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + if hasattr(tokenizer, "do_lower_case"): + do_lower_case = tokenizer.do_lower_case + else: + do_lower_case = tokenizer.do_lowercase_and_remove_accent + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for i, entry in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + if len(nbest_json) < 1: + raise ValueError("No valid predictions") + if best_non_null_entry is None: + raise ValueError("No valid predictions") + + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a26ab5776d74715428b10c4d9cd943e53b253785 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels +from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features +from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor +from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68e97a656acfe188f942bb93470e803ea9270796 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4ed7f71bbd9d4bb468883d751fa8f8fb6c85a2f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aeb1d0d8d25ed1ceda3ebfab617afd3ecd460ed Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5b682fee961c0dbb7513218015ffe027f9e0d64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b5c36e536c48b4fe62a9e3798468f5ba8000ffe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py new file mode 100644 index 0000000000000000000000000000000000000000..e005c9bcda13d15bc3aa32a50c79941166d0ba28 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py @@ -0,0 +1,643 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GLUE processors and helpers""" + +import os +import warnings +from dataclasses import asdict +from enum import Enum +from typing import Optional, Union + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import is_tf_available, logging +from .utils import DataProcessor, InputExample, InputFeatures + + +if is_tf_available(): + import tensorflow as tf + +logger = logging.get_logger(__name__) + +DEPRECATION_WARNING = ( + "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets " + "library. You can have a look at this example script for pointers: " + "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py" +) + + +def glue_convert_examples_to_features( + examples: Union[list[InputExample], "tf.data.Dataset"], + tokenizer: PreTrainedTokenizer, + max_length: Optional[int] = None, + task=None, + label_list=None, + output_mode=None, +): + """ + Loads a data file into a list of `InputFeatures` + + Args: + examples: List of `InputExamples` or `tf.data.Dataset` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length. Defaults to the tokenizer's max_len + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method + output_mode: String indicating the output mode. Either `regression` or `classification` + + Returns: + If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific + features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which + can be fed to the model. + + """ + warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning) + if is_tf_available() and isinstance(examples, tf.data.Dataset): + if task is None: + raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.") + return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) + return _glue_convert_examples_to_features( + examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode + ) + + +if is_tf_available(): + + def _tf_glue_convert_examples_to_features( + examples: tf.data.Dataset, + tokenizer: PreTrainedTokenizer, + task=str, + max_length: Optional[int] = None, + ) -> tf.data.Dataset: + """ + Returns: + A `tf.data.Dataset` containing the task-specific features. + + """ + processor = glue_processors[task]() + examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples] + features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) + label_type = tf.float32 if task == "sts-b" else tf.int64 + + def gen(): + for ex in features: + d = {k: v for k, v in asdict(ex).items() if v is not None} + label = d.pop("label") + yield (d, label) + + input_names = tokenizer.model_input_names + + return tf.data.Dataset.from_generator( + gen, + (dict.fromkeys(input_names, tf.int32), label_type), + ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])), + ) + + +def _glue_convert_examples_to_features( + examples: list[InputExample], + tokenizer: PreTrainedTokenizer, + max_length: Optional[int] = None, + task=None, + label_list=None, + output_mode=None, +): + if max_length is None: + max_length = tokenizer.model_max_length + + if task is not None: + processor = glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info(f"Using label list {label_list} for task {task}") + if output_mode is None: + output_mode = glue_output_modes[task] + logger.info(f"Using output mode {output_mode} for task {task}") + + label_map = {label: i for i, label in enumerate(label_list)} + + def label_from_example(example: InputExample) -> Union[int, float, None]: + if example.label is None: + return None + if output_mode == "classification": + return label_map[example.label] + elif output_mode == "regression": + return float(example.label) + raise KeyError(output_mode) + + labels = [label_from_example(example) for example in examples] + + batch_encoding = tokenizer( + [(example.text_a, example.text_b) for example in examples], + max_length=max_length, + padding="max_length", + truncation=True, + ) + + features = [] + for i in range(len(examples)): + inputs = {k: batch_encoding[k][i] for k in batch_encoding} + + feature = InputFeatures(**inputs, label=labels[i]) + features.append(feature) + + for i, example in enumerate(examples[:5]): + logger.info("*** Example ***") + logger.info(f"guid: {example.guid}") + logger.info(f"features: {features[i]}") + + return features + + +class OutputMode(Enum): + classification = "classification" + regression = "regression" + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{i}" + text_a = line[3] + text_b = line[4] + label = None if set_type == "test" else line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["premise"].numpy().decode("utf-8"), + tensor_dict["hypothesis"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + text_a = line[8] + text_b = line[9] + label = None if set_type.startswith("test") else line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + test_mode = set_type == "test" + if test_mode: + lines = lines[1:] + text_index = 1 if test_mode else 3 + examples = [] + for i, line in enumerate(lines): + guid = f"{set_type}-{i}" + text_a = line[text_index] + label = None if test_mode else line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + text_index = 1 if set_type == "test" else 0 + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{i}" + text_a = line[text_index] + label = None if set_type == "test" else line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + text_a = line[7] + text_b = line[8] + label = None if set_type == "test" else line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question1"].numpy().decode("utf-8"), + tensor_dict["question2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + test_mode = set_type == "test" + q1_index = 1 if test_mode else 3 + q2_index = 2 if test_mode else 4 + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + try: + text_a = line[q1_index] + text_b = line[q2_index] + label = None if test_mode else line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question"].numpy().decode("utf-8"), + tensor_dict["sentence"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + text_a = line[1] + text_b = line[2] + label = None if set_type == "test" else line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + text_a = line[1] + text_b = line[2] + label = None if set_type == "test" else line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning) + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"{set_type}-{line[0]}" + text_a = line[1] + text_b = line[2] + label = None if set_type == "test" else line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +glue_tasks_num_labels = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + +glue_processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} + +glue_output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py new file mode 100644 index 0000000000000000000000000000000000000000..5f37eb01813308a0c850e55ac283f13ccf231f68 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py @@ -0,0 +1,845 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from functools import partial +from multiprocessing import Pool, cpu_count +from multiprocessing.pool import ThreadPool +from typing import Optional + +import numpy as np +from tqdm import tqdm + +from ...models.bert.tokenization_bert import whitespace_tokenize +from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy +from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging +from .utils import DataProcessor + + +# Store the tokenizers which insert 2 separators tokens +MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"} + + +if is_torch_available(): + import torch + from torch.utils.data import TensorDataset + +if is_tf_available(): + import tensorflow as tf + +logger = logging.get_logger(__name__) + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for span_index, doc_span in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for span_index, doc_span in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def squad_convert_example_to_features( + example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training +): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'") + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for i, token in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + if tokenizer.__class__.__name__ in [ + "RobertaTokenizer", + "LongformerTokenizer", + "BartTokenizer", + "RobertaTokenizerFast", + "LongformerTokenizerFast", + "BartTokenizerFast", + ]: + sub_tokens = tokenizer.tokenize(token, add_prefix_space=True) + else: + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode( + example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length + ) + + # Tokenizers who insert 2 SEP tokens in-between & need to have special handling + # in the way they compute mask of added tokens. + tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower() + sequence_added_tokens = ( + tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1 + if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET + else tokenizer.model_max_length - tokenizer.max_len_single_sentence + ) + sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + # Define the side we want to truncate / pad and the text/pair sorting + if tokenizer.padding_side == "right": + texts = truncated_query + pairs = span_doc_tokens + truncation = TruncationStrategy.ONLY_SECOND.value + else: + texts = span_doc_tokens + pairs = truncated_query + truncation = TruncationStrategy.ONLY_FIRST.value + + encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic + texts, + pairs, + truncation=truncation, + padding=padding_strategy, + max_length=max_seq_length, + return_overflowing_tokens=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + return_token_type_ids=True, + ) + + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) + + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + if tokenizer.padding_side == "right": + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] + else: + last_padding_id_position = ( + len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) + ) + non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] + + else: + non_padded_ids = encoded_dict["input_ids"] + + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict or ( + "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 + ): + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = ( + j + if tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span["input_ids"].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implementation also keep the classification token (set to 0) + p_mask = np.ones_like(span["token_type_ids"]) + if tokenizer.padding_side == "right": + p_mask[len(truncated_query) + sequence_added_tokens :] = 0 + else: + p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 + + pad_token_indices = np.where(np.atleast_1d(span["input_ids"] == tokenizer.pad_token_id)) + special_token_indices = np.asarray( + tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) + ).nonzero() + + p_mask[pad_token_indices] = 1 + p_mask[special_token_indices] = 1 + + # Set the cls index to 0: the CLS index can be used for impossible answers + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. + unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + qas_id=example.qas_id, + ) + ) + return features + + +def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase): + global tokenizer + tokenizer = tokenizer_for_convert + + +def squad_convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + padding_strategy="max_length", + return_dataset=False, + threads=1, + tqdm_enabled=True, +): + """ + Converts a list of examples into a list of features that can be directly given as input to a model. It is + model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of [`~data.processors.squad.SquadExample`] + tokenizer: an instance of a child of [`PreTrainedTokenizer`] + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: whether to create features for model evaluation or model training. + padding_strategy: Default to "max_length". Which padding strategy to use + return_dataset: Default False. Either 'pt' or 'tf'. + if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset + threads: multiple processing threads. + + + Returns: + list of [`~data.processors.squad.SquadFeatures`] + + Example: + + ```python + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + ```""" + + threads = min(threads, cpu_count()) + pool_cls = ThreadPool if is_torch_hpu_available() else Pool + with pool_cls(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + padding_strategy=padding_strategy, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + disable=not tqdm_enabled, + ) + ) + + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm( + features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled + ): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) + unique_id += 1 + example_index += 1 + features = new_features + del new_features + if return_dataset == "pt": + if not is_torch_available(): + raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) + + if not is_training: + all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset( + all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask + ) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset( + all_input_ids, + all_attention_masks, + all_token_type_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + all_is_impossible, + ) + + return features, dataset + elif return_dataset == "tf": + if not is_tf_available(): + raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") + + def gen(): + for i, ex in enumerate(features): + if ex.token_type_ids is None: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + else: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + + # Why have we split the batch into a tuple? PyTorch just has a list of tensors. + if "token_type_ids" in tokenizer.model_input_names: + train_types = ( + { + "input_ids": tf.int32, + "attention_mask": tf.int32, + "token_type_ids": tf.int32, + "feature_index": tf.int64, + "qas_id": tf.string, + }, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) + + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) + else: + train_types = ( + {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string}, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) + + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) + + return tf.data.Dataset.from_generator(gen, train_types, train_shapes) + else: + return features + + +class SquadProcessor(DataProcessor): + """ + Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and + version 2.0 of SQuAD, respectively. + """ + + train_file = None + dev_file = None + + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + if not evaluate: + answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") + answer_start = tensor_dict["answers"]["answer_start"][0].numpy() + answers = [] + else: + answers = [ + {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} + for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) + ] + + answer = None + answer_start = None + + return SquadExample( + qas_id=tensor_dict["id"].numpy().decode("utf-8"), + question_text=tensor_dict["question"].numpy().decode("utf-8"), + context_text=tensor_dict["context"].numpy().decode("utf-8"), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict["title"].numpy().decode("utf-8"), + answers=answers, + ) + + def get_examples_from_dataset(self, dataset, evaluate=False): + """ + Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset. + + Args: + dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")* + evaluate: Boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples: + + ```python + >>> import tensorflow_datasets as tfds + + >>> dataset = tfds.load("squad") + + >>> training_examples = get_examples_from_dataset(dataset, evaluate=False) + >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + ```""" + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] + + examples = [] + for tensor_dict in tqdm(dataset): + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + + return examples + + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ + if data_dir is None: + data_dir = "" + + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the evaluation file has a different name than the original one + which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively. + """ + if data_dir is None: + data_dir = "" + + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def _create_examples(self, input_data, set_type): + is_training = set_type == "train" + examples = [] + for entry in tqdm(input_data): + title = entry["title"] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position_character = None + answer_text = None + answers = [] + + is_impossible = qa.get("is_impossible", False) + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer["text"] + start_position_character = answer["answer_start"] + else: + answers = qa["answers"] + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position_character=start_position_character, + title=title, + is_impossible=is_impossible, + answers=answers, + ) + examples.append(example) + return examples + + +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + + +class SquadExample: + """ + A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. + """ + + def __init__( + self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + ): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.title = title + self.is_impossible = is_impossible + self.answers = answers + + self.start_position, self.end_position = 0, 0 + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start and end positions only has a value during evaluation. + if start_position_character is not None and not is_impossible: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[ + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) + ] + + +class SquadFeatures: + """ + Single squad example features to be fed to a model. Those features are model-specific and can be crafted from + [`~data.processors.squad.SquadExample`] using the + :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: + List of booleans identifying which tokens have their maximum context in this feature object. If a token + does not have their maximum context in this feature object, it means that another feature object has more + information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. + start_position: start of the answer token index + end_position: end of the answer token index + encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods. + """ + + def __init__( + self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map, + start_position, + end_position, + is_impossible, + qas_id: Optional[str] = None, + encoding: Optional[BatchEncoding] = None, + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + self.qas_id = qas_id + + self.encoding = encoding + + +class SquadResult: + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. + + Args: + unique_id: The unique identifier corresponding to that example. + start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer + """ + + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_logits = start_logits + self.end_logits = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..462156ebac384e08d78a7b42ea06f35a457e5feb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import dataclasses +import json +from dataclasses import dataclass +from typing import Optional, Union + +from ...utils import is_tf_available, is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +@dataclass +class InputExample: + """ + A single training/test example for simple sequence classification. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + + guid: str + text_a: str + text_b: Optional[str] = None + label: Optional[str] = None + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(dataclasses.asdict(self), indent=2) + "\n" + + +@dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. Property names are the same names as the corresponding inputs to a model. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded) + tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + """ + + input_ids: list[int] + attention_mask: Optional[list[int]] = None + token_type_ids: Optional[list[int]] = None + label: Optional[Union[int, float]] = None + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(dataclasses.asdict(self)) + "\n" + + +class DataProcessor: + """Base class for data converters for sequence classification data sets.""" + + def get_example_from_tensor_dict(self, tensor_dict): + """ + Gets an example from a dict with tensorflow tensors. + + Args: + tensor_dict: Keys and values should match the corresponding Glue + tensorflow_dataset examples. + """ + raise NotImplementedError() + + def get_train_examples(self, data_dir): + """Gets a collection of [`InputExample`] for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of [`InputExample`] for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of [`InputExample`] for the test set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + def tfds_map(self, example): + """ + Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts + examples to the correct format. + """ + if len(self.get_labels()) > 1: + example.label = self.get_labels()[int(example.label)] + return example + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + return list(csv.reader(f, delimiter="\t", quotechar=quotechar)) + + +class SingleSentenceClassificationProcessor(DataProcessor): + """Generic processor for a single sentence classification data set.""" + + def __init__(self, labels=None, examples=None, mode="classification", verbose=False): + self.labels = [] if labels is None else labels + self.examples = [] if examples is None else examples + self.mode = mode + self.verbose = verbose + + def __len__(self): + return len(self.examples) + + def __getitem__(self, idx): + if isinstance(idx, slice): + return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx]) + return self.examples[idx] + + @classmethod + def create_from_csv( + cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs + ): + processor = cls(**kwargs) + processor.add_examples_from_csv( + file_name, + split_name=split_name, + column_label=column_label, + column_text=column_text, + column_id=column_id, + skip_first_row=skip_first_row, + overwrite_labels=True, + overwrite_examples=True, + ) + return processor + + @classmethod + def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs): + processor = cls(**kwargs) + processor.add_examples(texts_or_text_and_labels, labels=labels) + return processor + + def add_examples_from_csv( + self, + file_name, + split_name="", + column_label=0, + column_text=1, + column_id=None, + skip_first_row=False, + overwrite_labels=False, + overwrite_examples=False, + ): + lines = self._read_tsv(file_name) + if skip_first_row: + lines = lines[1:] + texts = [] + labels = [] + ids = [] + for i, line in enumerate(lines): + texts.append(line[column_text]) + labels.append(line[column_label]) + if column_id is not None: + ids.append(line[column_id]) + else: + guid = f"{split_name}-{i}" if split_name else str(i) + ids.append(guid) + + return self.add_examples( + texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples + ) + + def add_examples( + self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False + ): + if labels is not None and len(texts_or_text_and_labels) != len(labels): + raise ValueError( + f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}" + ) + if ids is not None and len(texts_or_text_and_labels) != len(ids): + raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}") + if ids is None: + ids = [None] * len(texts_or_text_and_labels) + if labels is None: + labels = [None] * len(texts_or_text_and_labels) + examples = [] + added_labels = set() + for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids): + if isinstance(text_or_text_and_label, (tuple, list)) and label is None: + text, label = text_or_text_and_label + else: + text = text_or_text_and_label + added_labels.add(label) + examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label)) + + # Update examples + if overwrite_examples: + self.examples = examples + else: + self.examples.extend(examples) + + # Update labels + if overwrite_labels: + self.labels = list(added_labels) + else: + self.labels = list(set(self.labels).union(added_labels)) + + return self.examples + + def get_features( + self, + tokenizer, + max_length=None, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, + return_tensors=None, + ): + """ + Convert examples in a list of `InputFeatures` + + Args: + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values + and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual + values) + + Returns: + If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the + task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific + `InputFeatures` which can be fed to the model. + + """ + if max_length is None: + max_length = tokenizer.max_len + + label_map = {label: i for i, label in enumerate(self.labels)} + + all_input_ids = [] + for ex_index, example in enumerate(self.examples): + if ex_index % 10000 == 0: + logger.info(f"Tokenizing example {ex_index}") + + input_ids = tokenizer.encode( + example.text_a, + add_special_tokens=True, + max_length=min(max_length, tokenizer.max_len), + ) + all_input_ids.append(input_ids) + + batch_length = max(len(input_ids) for input_ids in all_input_ids) + + features = [] + for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)): + if ex_index % 10000 == 0: + logger.info(f"Writing example {ex_index}/{len(self.examples)}") + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = batch_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + + if len(input_ids) != batch_length: + raise ValueError(f"Error with input length {len(input_ids)} vs {batch_length}") + if len(attention_mask) != batch_length: + raise ValueError(f"Error with input length {len(attention_mask)} vs {batch_length}") + + if self.mode == "classification": + label = label_map[example.label] + elif self.mode == "regression": + label = float(example.label) + else: + raise ValueError(self.mode) + + if ex_index < 5 and self.verbose: + logger.info("*** Example ***") + logger.info(f"guid: {example.guid}") + logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}") + logger.info(f"attention_mask: {' '.join([str(x) for x in attention_mask])}") + logger.info(f"label: {example.label} (id = {label})") + + features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) + + if return_tensors is None: + return features + elif return_tensors == "tf": + if not is_tf_available(): + raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") + import tensorflow as tf + + def gen(): + for ex in features: + yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label) + + dataset = tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), + ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])), + ) + return dataset + elif return_tensors == "pt": + if not is_torch_available(): + raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported") + import torch + from torch.utils.data import TensorDataset + + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + if self.mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif self.mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + + dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels) + return dataset + else: + raise ValueError("return_tensors should be one of 'tf' or 'pt'") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py new file mode 100644 index 0000000000000000000000000000000000000000..4d8ec17a8345db5bf08325a334a4c6eb8af29157 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""XNLI utils (dataset loading and evaluation)""" + +import os + +from ...utils import logging +from .utils import DataProcessor, InputExample + + +logger = logging.get_logger(__name__) + + +class XnliProcessor(DataProcessor): + """ + Processor for the XNLI dataset. Adapted from + https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207 + """ + + def __init__(self, language, train_language=None): + self.language = language + self.train_language = train_language + + def get_train_examples(self, data_dir): + """See base class.""" + lg = self.language if self.train_language is None else self.train_language + lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv")) + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = f"train-{i}" + text_a = line[0] + text_b = line[1] + label = "contradiction" if line[2] == "contradictory" else line[2] + if not isinstance(text_a, str): + raise TypeError(f"Training input {text_a} is not a string") + if not isinstance(text_b, str): + raise TypeError(f"Training input {text_b} is not a string") + if not isinstance(label, str): + raise TypeError(f"Training label {label} is not a string") + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + language = line[0] + if language != self.language: + continue + guid = f"test-{i}" + text_a = line[6] + text_b = line[7] + label = line[1] + if not isinstance(text_a, str): + raise TypeError(f"Training input {text_a} is not a string") + if not isinstance(text_b, str): + raise TypeError(f"Training input {text_b} is not a string") + if not isinstance(label, str): + raise TypeError(f"Training label {label} is not a string") + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +xnli_processors = { + "xnli": XnliProcessor, +} + +xnli_output_modes = { + "xnli": "classification", +} + +xnli_tasks_num_labels = { + "xnli": 3, +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4fa8a11d5fc5819fd8e9d2949940aa754a2a281 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0f3795a70e24e399f06f63fd0733cad6b30e823 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19cfc2c83632a743c04a624ddd2e00f50c629a73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..388a73d22d4c9b561e2a887b50a1897b8cf2def9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp @@ -0,0 +1,40 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..7eac8c8bcd1bf529bb9c13d54d2d4215c9e4c89f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h @@ -0,0 +1,32 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ea1d7fabe2684dbb85f00fae2c47b469687cb2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu @@ -0,0 +1,156 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include "cuda/ms_deform_im2col_cuda.cuh" + +#include +#include +#include +#include + +#pragma once +#include + + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..34f8ae9cb77bbaa8cb4dd25e0cb86632db9ad05d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh @@ -0,0 +1,1467 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + + +template +__global__ void ms_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockSize; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockSize/2; s>0; s>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..fbcf4543e66bb1162f42ce2ae57e1bac92243cb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h @@ -0,0 +1,29 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c0db0c88c9db2c09d7f601937ea0f6ac480913bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh @@ -0,0 +1,1327 @@ +/*! +************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************** +* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) +* Copyright (c) 2018 Microsoft +************************************************************************** +*/ + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + + +template +__global__ void ms_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockSize; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockSize/2; s>0; s>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h new file mode 100644 index 0000000000000000000000000000000000000000..119b1fa317d1e5fcfb61a4837e560e9248db05f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h @@ -0,0 +1,61 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once + +#include "cpu/ms_deform_attn_cpu.h" + +#ifdef WITH_CUDA +#include "cuda/ms_deform_attn_cuda.h" +#endif + + +at::Tensor +ms_deform_attn_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_forward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +ms_deform_attn_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_backward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6ce3875568b9ba8d660c90acc805077cca98f891 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp @@ -0,0 +1,16 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include "ms_deform_attn.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); + m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); +} \ No newline at end of file diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..da88e3394f653369a7443245c67dcbe57f2ed23e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .selective_scan_with_ln_interface import mamba_inner_fn diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4949778aaf6f0ec7de038aa63970457b44f254a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8103fa82787a7da29d05d64a74c480d9646fc77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..4a74986a81a13f9428eab353de5b61a4d101972d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py @@ -0,0 +1,525 @@ +# coding=utf-8 +# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Original code from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch.cuda.amp import custom_bwd, custom_fwd + + +try: + import causal_conv1d_cuda +except ImportError: + causal_conv1d_cuda = None + +import mamba_ssm +import selective_scan_cuda + + +# For BC for old mamba-ssm versions: https://github.com/huggingface/transformers/pull/33195#discussion_r1736401127 +if hasattr(mamba_ssm.ops.triton, "layernorm"): + from mamba_ssm.ops.triton.layernorm import _layer_norm_fwd +else: + from mamba_ssm.ops.triton.layer_norm import _layer_norm_fwd + + +class SelectiveScanFn(torch.autograd.Function): + @staticmethod + def forward( + ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False + ): + if u.stride(-1) != 1: + u = u.contiguous() + if delta.stride(-1) != 1: + delta = delta.contiguous() + if D is not None: + D = D.contiguous() + if B.stride(-1) != 1: + B = B.contiguous() + if C.stride(-1) != 1: + C = C.contiguous() + if z is not None and z.stride(-1) != 1: + z = z.contiguous() + if B.dim() == 3: + B = rearrange(B, "b dstate l -> b 1 dstate l") + ctx.squeeze_B = True + if C.dim() == 3: + C = rearrange(C, "b dstate l -> b 1 dstate l") + ctx.squeeze_C = True + out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus) + ctx.delta_softplus = delta_softplus + ctx.has_z = z is not None + last_state = x[:, :, -1, 1::2] # (batch, dim, dstate) + if not ctx.has_z: + ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x) + return out if not return_last_state else (out, last_state) + else: + ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out) + out_z = rest[0] + return out_z if not return_last_state else (out_z, last_state) + + @staticmethod + def backward(ctx, dout, *args): + if not ctx.has_z: + u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors + z = None + out = None + else: + u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors + if dout.stride(-1) != 1: + dout = dout.contiguous() + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). + # Here we just pass in None and dz will be allocated in the C++ code. + du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd( + u, + delta, + A, + B, + C, + D, + z, + delta_bias, + dout, + x, + out, + None, + ctx.delta_softplus, + False, # option to recompute out_z, not used here + ) + dz = rest[0] if ctx.has_z else None + dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB + dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC + return ( + du, + ddelta, + dA, + dB, + dC, + dD if D is not None else None, + dz, + ddelta_bias if delta_bias is not None else None, + None, + None, + ) + + +def rms_norm_forward( + x, + weight, + bias, + eps=1e-6, + is_rms_norm=True, +): + # x (b l) d + if x.stride(-1) != 1: + x = x.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + y = _layer_norm_fwd(x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm)[0] + # y (b l) d + return y + + +def selective_scan_fn( + u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False +): + """if return_last_state is True, returns (out, last_state) + last_state has shape (batch, dim, dstate). Note that the gradient of the last state is + not considered in the backward pass. + """ + return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state) + + +def selective_scan_ref( + u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False +): + """ + u: r(B D L) + delta: r(B D L) + A: c(D N) or r(D N) + B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L) + C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L) + D: r(D) + z: r(B D L) + delta_bias: r(D), fp32 + + out: r(B D L) + last_state (optional): r(B D dstate) or c(B D dstate) + """ + dtype_in = u.dtype + u = u.float() + delta = delta.float() + if delta_bias is not None: + delta = delta + delta_bias[..., None].float() + if delta_softplus: + delta = F.softplus(delta) + batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1] + is_variable_B = B.dim() >= 3 + is_variable_C = C.dim() >= 3 + if A.is_complex(): + if is_variable_B: + B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2)) + if is_variable_C: + C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2)) + else: + B = B.float() + C = C.float() + x = A.new_zeros((batch, dim, dstate)) + ys = [] + deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A)) + if not is_variable_B: + deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u) + else: + if B.dim() == 3: + deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u) + else: + B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1]) + deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u) + if is_variable_C and C.dim() == 4: + C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1]) + last_state = None + for i in range(u.shape[2]): + x = deltaA[:, :, i] * x + deltaB_u[:, :, i] + if not is_variable_C: + y = torch.einsum("bdn,dn->bd", x, C) + else: + if C.dim() == 3: + y = torch.einsum("bdn,bn->bd", x, C[:, :, i]) + else: + y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i]) + if i == u.shape[2] - 1: + last_state = x + if y.is_complex(): + y = y.real * 2 + ys.append(y) + y = torch.stack(ys, dim=2) # (batch dim L) + out = y if D is None else y + u * rearrange(D, "d -> d 1") + if z is not None: + out = out * F.silu(z) + out = out.to(dtype=dtype_in) + return out if not return_last_state else (out, last_state) + + +class MambaInnerFn(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + xz, + conv1d_weight, + conv1d_bias, + x_proj_weight, + delta_proj_weight, + out_proj_weight, + out_proj_bias, + A, + B=None, + C=None, + D=None, + delta_bias=None, + B_proj_bias=None, + C_proj_bias=None, + delta_softplus=True, + checkpoint_lvl=1, + b_rms_weight=None, + c_rms_weight=None, + dt_rms_weight=None, + b_c_dt_rms_eps=1e-6, + ): + """ + xz: (batch, dim, seqlen) + """ + assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d." + assert checkpoint_lvl in [0, 1] + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + if torch.is_autocast_enabled(): + x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_bias = ( + out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype()) if out_proj_bias is not None else None + ) + if xz.stride(-1) != 1: + xz = xz.contiguous() + conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w") + x, z = xz.chunk(2, dim=1) + conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True) + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. + x_dbl = F.linear(rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight) # (bl d) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L) + ctx.is_variable_B = B is None + ctx.is_variable_C = C is None + ctx.B_proj_bias_is_None = B_proj_bias is None + ctx.C_proj_bias_is_None = C_proj_bias is None + if B is None: # variable B + B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl dstate) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if B.stride(-1) != 1: + B = B.contiguous() + if C is None: # variable C + C = x_dbl[:, -d_state:] # (bl dstate) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if C.stride(-1) != 1: + C = C.contiguous() + if D is not None: + D = D.contiguous() + + if b_rms_weight is not None: + B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous() + B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps) + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + if c_rms_weight is not None: + C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous() + C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps) + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + if dt_rms_weight is not None: + delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous() + delta = rms_norm_forward(delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps) + delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous() + + out, scan_intermediates, out_z = selective_scan_cuda.fwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus + ) + ctx.delta_softplus = delta_softplus + ctx.out_proj_bias_is_None = out_proj_bias is None + ctx.checkpoint_lvl = checkpoint_lvl + ctx.b_rms_weight = b_rms_weight + ctx.c_rms_weight = c_rms_weight + ctx.dt_rms_weight = dt_rms_weight + ctx.b_c_dt_rms_eps = b_c_dt_rms_eps + if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass + conv1d_out, delta = None, None + ctx.save_for_backward( + xz, + conv1d_weight, + conv1d_bias, + x_dbl, + x_proj_weight, + delta_proj_weight, + out_proj_weight, + conv1d_out, + delta, + A, + B, + C, + D, + delta_bias, + scan_intermediates, + b_rms_weight, + c_rms_weight, + dt_rms_weight, + out, + ) + return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias) + + @staticmethod + @custom_bwd + def backward(ctx, dout): + # dout: (batch, seqlen, dim) + assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d." + ( + xz, + conv1d_weight, + conv1d_bias, + x_dbl, + x_proj_weight, + delta_proj_weight, + out_proj_weight, + conv1d_out, + delta, + A, + B, + C, + D, + delta_bias, + scan_intermediates, + b_rms_weight, + c_rms_weight, + dt_rms_weight, + out, + ) = ctx.saved_tensors + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + if dout.stride(-1) != 1: + dout = dout.contiguous() + if ctx.checkpoint_lvl == 1: + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L) + if dt_rms_weight is not None: + delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous() + delta = rms_norm_forward(delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps) + delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous() + if b_rms_weight is not None: + # Recompute & RMSNorm B + B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous() + B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps) + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + if c_rms_weight is not None: + # Recompute & RMSNorm C + C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous() + C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps) + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). + dxz = torch.empty_like(xz) # (batch, dim, seqlen) + dx, dz = dxz.chunk(2, dim=1) + dout = rearrange(dout, "b l e -> e (b l)") + dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L) + dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd( + conv1d_out, + delta, + A, + B, + C, + D, + z, + delta_bias, + dout_y, + scan_intermediates, + out, + dz, + ctx.delta_softplus, + True, # option to recompute out_z + ) + dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")) + dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None + dD = dD if D is not None else None + dx_dbl = torch.empty_like(x_dbl) + dB_proj_bias = None + if ctx.is_variable_B: + if not A.is_complex(): + dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None + dx_dbl[:, delta_rank : delta_rank + d_state] = dB # (bl d) + dB = None + dC_proj_bias = None + if ctx.is_variable_C: + if not A.is_complex(): + dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None + dx_dbl[:, -d_state:] = dC # (bl d) + dC = None + ddelta = rearrange(ddelta, "b d l -> d (b l)") + ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank]) + dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight) + dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)") + dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")) + dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out) + dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]) + # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the + # backward of conv1d with the backward of chunk). + dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd( + x, conv1d_weight, conv1d_bias, dconv1d_out, None, None, None, dx, False, True + ) + dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None + dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w") + return ( + dxz, + dconv1d_weight, + dconv1d_bias, + dx_proj_weight, + ddelta_proj_weight, + dout_proj_weight, + dout_proj_bias, + dA, + dB, + dC, + dD, + ddelta_bias if delta_bias is not None else None, + # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps + dB_proj_bias, + dC_proj_bias, + None, + None, + None, + None, + None, + None, + ) + + +def mamba_inner_fn( + xz, + conv1d_weight, + conv1d_bias, + x_proj_weight, + delta_proj_weight, + out_proj_weight, + out_proj_bias, + A, + B=None, + C=None, + D=None, + delta_bias=None, + B_proj_bias=None, + C_proj_bias=None, + delta_softplus=True, + checkpoint_lvl=1, + b_rms_weight=None, + c_rms_weight=None, + dt_rms_weight=None, + b_c_dt_rms_eps=1e-6, +): + return MambaInnerFn.apply( + xz, + conv1d_weight, + conv1d_bias, + x_proj_weight, + delta_proj_weight, + out_proj_weight, + out_proj_bias, + A, + B, + C, + D, + delta_bias, + B_proj_bias, + C_proj_bias, + delta_softplus, + checkpoint_lvl, + b_rms_weight, + c_rms_weight, + dt_rms_weight, + b_c_dt_rms_eps, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..87ed89052873813153786bd416a981d3e5279af9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu @@ -0,0 +1,383 @@ +#include "cuda_kernel.h" + +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void index_max_cuda_kernel( + float *index_vals, // [batch_size, 32, num_block] + int *indices, // [batch_size, num_block] + float *max_vals, // [batch_size, A_num_block * 32] + float *max_vals_scatter, // [batch_size, 32, num_block] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +) { + + long batch_idx = blockIdx.x; + + long thread_idx = threadIdx.x; + long num_thread = blockDim.x; + + extern __shared__ float buffer[]; + int *max_buffer = (int*)buffer; + + for (int i = 0; i < A_num_block * 32; i = i + num_thread) { + int idx = i + thread_idx; + if (idx < A_num_block * 32) { + max_buffer[idx] = -1e8; + } + } + __syncthreads(); + + int *indices_pt = &indices[batch_idx * num_block]; + float *index_vals_pt = &index_vals[batch_idx * num_block * 32]; + + for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) { + int idx = idx_start + thread_idx; + int A_block_idx = indices_pt[idx % num_block] / B_num_block; + atomicMax(&max_buffer[A_block_idx * 32 + idx / num_block], (int)(index_vals_pt[idx] * 1000)); + } + __syncthreads(); + + float *max_vals_pt = &max_vals[batch_idx * A_num_block * 32]; + for (int i = 0; i < A_num_block * 32; i = i + num_thread) { + int idx = i + thread_idx; + if (idx < A_num_block * 32) { + max_vals_pt[idx] = (float)max_buffer[idx] / 1000.; + } + } + + float *max_vals_scatter_pt = &max_vals_scatter[batch_idx * num_block * 32]; + for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) { + int idx = idx_start + thread_idx; + int A_block_idx = indices_pt[idx % num_block] / B_num_block; + max_vals_scatter_pt[idx] = (float)max_buffer[A_block_idx * 32 + idx / num_block] / 1000.; + } + +} + +__global__ void mm_to_sparse_cuda_kernel( + float *dense_A, // [batch_size, A_num_block, dim, 32] + float *dense_B, // [batch_size, B_num_block, dim, 32] + int *indices, // [batch_size, num_block] + float *sparse_C, // [batch_size, num_block, 32, 32] + long batch_size, + long A_num_block, + long B_num_block, + long dim, + long num_block +) { + + long batch_idx = blockIdx.y; + long block_idx = blockIdx.x * blockDim.y + threadIdx.y; + + long thread_idx = threadIdx.x; + + __shared__ float buffer[4096]; + float *A_buffer = &buffer[threadIdx.y * 1024]; // [2, 8, 32] + float *B_buffer = &buffer[threadIdx.y * 1024 + 512]; // [2, 8, 32] + + long batch_idx__block_idx = batch_idx * num_block + block_idx; + + long AB_block_idx = indices[batch_idx__block_idx]; + float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * dim * 32]; + float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * dim * 32]; + + int reg_1_idx = thread_idx / 8; // [0000000011111111222222223333333344444444555555556666666677777777] + int reg_2_idx = thread_idx % 8; // [0123456701234567012345670123456701234567012345670123456701234567] + + float reg_1[8]; + float reg_2[8]; + + float reg_array[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + #pragma unroll + for (int i = 0; i < 4; i++) { + A_buffer[i * 64 + thread_idx] = dense_A_pt[i * 64 + thread_idx]; + B_buffer[i * 64 + thread_idx] = dense_B_pt[i * 64 + thread_idx]; + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[i] = A_buffer[reg_1_idx * 4 + i]; + reg_2[i] = B_buffer[reg_2_idx * 4 + i]; + } + + for (int dim_stride = 1; dim_stride < (dim / 8); dim_stride++) { + + #pragma unroll + for (int i = 0; i < 4; i++) { + A_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_A_pt[dim_stride * 256 + i * 64 + thread_idx]; + B_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_B_pt[dim_stride * 256 + i * 64 + thread_idx]; + } + + #pragma unroll + for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) { + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_1_idx * 4 + i]; + reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_2_idx * 4 + i]; + } + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j]; + } + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[i] = A_buffer[(dim_stride % 2) * 256 + reg_1_idx * 4 + i]; + reg_2[i] = B_buffer[(dim_stride % 2) * 256 + reg_2_idx * 4 + i]; + } + + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j]; + } + } + + } + + #pragma unroll + for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) { + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[256 + mini_dim_idx * 32 + reg_1_idx * 4 + i]; + reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[256 + mini_dim_idx * 32 + reg_2_idx * 4 + i]; + } + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j]; + } + } + } + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j]; + } + } + __syncthreads(); + + float *C_buffer = &buffer[threadIdx.y * 1024]; // [32, 32] + + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + C_buffer[(reg_2_idx * 4 + j) * 32 + reg_1_idx * 4 + i] = reg_array[i * 4 + j]; + } + } + __syncthreads(); + + float *sparse_C_pt = &sparse_C[batch_idx__block_idx * 1024]; + + #pragma unroll + for (int i = 0; i < 16; i++) { + sparse_C_pt[i * 64 + thread_idx] = C_buffer[i * 64 + thread_idx]; + } + +} + +__global__ void sparse_dense_mm_cuda_kernel( + float *sparse_A, // [batch_size, num_block, 32, 32] + int *indices, // [batch_size, num_block] + float *dense_B, // [batch_size, B_num_block, dim, 32] + float *dense_C, // [batch_size, A_num_block, dim, 32] + long batch_size, + long A_num_block, + long B_num_block, + long dim, + long num_block +) { + + long batch_idx = blockIdx.y; + long block_idx = blockIdx.x * blockDim.y + threadIdx.y; + + long thread_idx = threadIdx.x; + + __shared__ float buffer[6144]; + float *A_buffer = &buffer[threadIdx.y * 3072]; // [32, 32] + float *B_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [32, 64] + + long batch_idx__block_idx = batch_idx * num_block + block_idx; + + float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024]; + #pragma unroll + for (int i = 0; i < 8; i++) { + A_buffer[i * 128 + thread_idx] = sparse_A_pt[i * 128 + thread_idx]; + } + + long AB_block_idx = indices[batch_idx__block_idx]; + float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * 32 * dim]; + float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32 * dim]; + + // [0000000011111111222222223333333344444444555555556666666677777777] + // [0123456701234567012345670123456701234567012345670123456701234567] + int reg_1_idx = thread_idx / 8; + int reg_2_idx = thread_idx % 8; + + float reg_1[8]; + float reg_2[8]; + + float reg_array[16]; + + for (int dim_stride = 0; dim_stride < dim; dim_stride = dim_stride + 64) { + + #pragma unroll + for (int i = 0; i < 16; i++) { + B_buffer[i * 128 + thread_idx] = dense_B_pt[dim_stride * 32 + i * 128 + thread_idx]; + } + + #pragma unroll + for (int i = 0; i < 16; i++) { + reg_array[i] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[i] = B_buffer[(reg_1_idx * 4 + i) * 32]; + reg_2[i] = A_buffer[reg_2_idx * 4 + i]; + } + + #pragma unroll + for (int mini_dim_idx = 1; mini_dim_idx < 32; mini_dim_idx++) { + #pragma unroll + for (int i = 0; i < 4; i++) { + reg_1[(mini_dim_idx % 2) * 4 + i] = B_buffer[(reg_1_idx * 4 + i) * 32 + mini_dim_idx]; + reg_2[(mini_dim_idx % 2) * 4 + i] = A_buffer[mini_dim_idx * 32 + reg_2_idx * 4 + i]; + } + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j]; + } + } + } + + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j]; + } + } + + __syncthreads(); + + float *C_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [64, 32] + + #pragma unroll + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + C_buffer[(reg_1_idx * 4 + i) * 32 + reg_2_idx * 4 + j] = reg_array[i * 4 + j]; + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; i < 16; i++) { + atomicAdd(&dense_C_pt[dim_stride * 32 + i * 128 + thread_idx], C_buffer[i * 128 + thread_idx]); + } + __syncthreads(); + + } + +} + + +__global__ void reduce_sum_cuda_kernel( + float *sparse_A, // [batch_size, num_block, 32, 32] + int *indices, // [batch_size, num_block] + float *dense_C, // [batch_size, A_num_block, 32] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +) { + + long batch_idx = blockIdx.y; + long block_idx = blockIdx.x * blockDim.y + threadIdx.y; + + long thread_idx = threadIdx.x; + + long batch_idx__block_idx = batch_idx * num_block + block_idx; + + long AB_block_idx = indices[batch_idx__block_idx]; + float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024]; + + float reg_array[16]; + float value = 0; + + #pragma unroll + for (int i = 0; i < 8; i++) { + reg_array[i] = sparse_A_pt[i * 32 + thread_idx]; + } + #pragma unroll + for (int stride = 8; stride < 32; stride = stride + 8) { + #pragma unroll + for (int i = 0; i < 8; i++) { + reg_array[(stride + i) % 16] = sparse_A_pt[(stride + i) * 32 + thread_idx]; + } + #pragma unroll + for (int i = 0; i < 8; i++) { + value = value + reg_array[(stride - 8 + i) % 16]; + } + } + #pragma unroll + for (int i = 0; i < 8; i++) { + value = value + reg_array[8 + i]; + } + + float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32]; + + atomicAdd(&dense_C_pt[thread_idx], value); + +} + +__global__ void scatter_cuda_kernel( + float *dense_A, // [batch_size, A_num_block, 32] + int *indices, // [batch_size, num_block] + float *sparse_C, // [batch_size, num_block, 32, 32] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +) { + + long batch_idx = blockIdx.y; + long block_idx = blockIdx.x * blockDim.y + threadIdx.y; + + long thread_idx = threadIdx.x; + + long batch_idx__block_idx = batch_idx * num_block + block_idx; + + long AB_block_idx = indices[batch_idx__block_idx]; + float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32]; + float *sparse_C_pt = &sparse_C[(batch_idx * num_block + block_idx) * 1024]; + + float value = dense_A_pt[thread_idx]; + + #pragma unroll + for (int i = 0; i < 32; i++) { + sparse_C_pt[i * 32 + thread_idx] = value; + } + +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a95b46f7d159b11851143710034cf80c20aa6bf8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h @@ -0,0 +1,59 @@ + +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff +#define OPTIMAL_THREADS 256 + +__global__ void index_max_cuda_kernel( + float *index_vals, // [batch_size, 32, num_block] + int *indices, // [batch_size, num_block] + float *max_vals, // [batch_size, A_num_block * 32] + float *max_vals_scatter, // [batch_size, 32, num_block] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +); + +__global__ void mm_to_sparse_cuda_kernel( + float *dense_A, // [batch_size, A_num_block, dim, 32] + float *dense_B, // [batch_size, B_num_block, dim, 32] + int *indices, // [batch_size, num_block] + float *sparse_C, // [batch_size, num_block, 32, 32] + long batch_size, + long A_num_block, + long B_num_block, + long dim, + long num_block +); + +__global__ void sparse_dense_mm_cuda_kernel( + float *sparse_A, // [batch_size, num_block, 32, 32] + int *indices, // [batch_size, num_block] + float *dense_B, // [batch_size, B_num_block, dim, 32] + float *dense_C, // [batch_size, A_num_block, dim, 32] + long batch_size, + long A_num_block, + long B_num_block, + long dim, + long num_block +); + +__global__ void reduce_sum_cuda_kernel( + float *sparse_A, // [batch_size, num_block, 32, 32] + int *indices, // [batch_size, num_block] + float *dense_C, // [batch_size, A_num_block, 32] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +); + +__global__ void scatter_cuda_kernel( + float *dense_A, // [batch_size, A_num_block, 32] + int *indices, // [batch_size, num_block] + float *sparse_C, // [batch_size, num_block, 32, 32] + long batch_size, + long A_num_block, + long B_num_block, + long num_block +); diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu new file mode 100644 index 0000000000000000000000000000000000000000..ba2a0cacfe614e75e06d2dde80dc77a6e8a4ec1a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu @@ -0,0 +1,154 @@ +#include +#include +#include "cuda_launch.h" +#include "cuda_kernel.h" +#include + +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector index_max_kernel( + at::Tensor index_vals, // [batch_size, 32, num_block] + at::Tensor indices, // [batch_size, num_block], + int A_num_block, + int B_num_block +) { + int batch_size = indices.size(0); + int num_block = indices.size(1); + + at::Tensor max_vals = at::zeros({batch_size, A_num_block * 32}, index_vals.options()); + at::Tensor max_vals_scatter = at::zeros({batch_size, 32, num_block}, index_vals.options()); + + dim3 threads(256); + dim3 blocks(batch_size); + int shared_mem = A_num_block * 32 * sizeof(float); + + index_max_cuda_kernel<<>>( + index_vals.data_ptr(), + indices.data_ptr(), + max_vals.data_ptr(), + max_vals_scatter.data_ptr(), + batch_size, + A_num_block, + B_num_block, + num_block + ); + + return {max_vals, max_vals_scatter}; +} + +at::Tensor mm_to_sparse_kernel( + at::Tensor dense_A, // [batch_size, A_num_block, dim, 32] + at::Tensor dense_B, // [batch_size, B_num_block, dim, 32] + at::Tensor indices // [batch_size, num_block] +) { + int batch_size = dense_A.size(0); + int A_num_block = dense_A.size(1); + int B_num_block = dense_B.size(1); + int dim = dense_A.size(2); + int num_block = indices.size(1); + + at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options()); + + dim3 threads(64, 4); + dim3 blocks(num_block / 4, batch_size); + + mm_to_sparse_cuda_kernel<<>>( + dense_A.data_ptr(), + dense_B.data_ptr(), + indices.data_ptr(), + sparse_C.data_ptr(), + batch_size, + A_num_block, + B_num_block, + dim, + num_block + ); + + return sparse_C; +} + +at::Tensor sparse_dense_mm_kernel( + at::Tensor sparse_A, // [batch_size, num_block, 32, 32] + at::Tensor indices, // [batch_size, num_block] + at::Tensor dense_B, // [batch_size, B_num_block, dim, 32] + int A_num_block +) { + int batch_size = sparse_A.size(0); + int num_block = sparse_A.size(1); + int B_num_block = dense_B.size(1); + int dim = dense_B.size(2); + + at::Tensor dense_C = at::zeros({batch_size, A_num_block, dim, 32}, dense_B.options()); + + dim3 threads(128, 2); + dim3 blocks(num_block / 2, batch_size); + + sparse_dense_mm_cuda_kernel<<>>( + sparse_A.data_ptr(), + indices.data_ptr(), + dense_B.data_ptr(), + dense_C.data_ptr(), + batch_size, + A_num_block, + B_num_block, + dim, + num_block + ); + + return dense_C; +} + +at::Tensor reduce_sum_kernel( + at::Tensor sparse_A, // [batch_size, num_block, 32, 32] + at::Tensor indices, // [batch_size, num_block] + int A_num_block, + int B_num_block +) { + int batch_size = sparse_A.size(0); + int num_block = sparse_A.size(1); + + at::Tensor dense_C = at::zeros({batch_size, A_num_block, 32}, sparse_A.options()); + + dim3 threads(32, 4); + dim3 blocks(num_block / 4, batch_size); + + reduce_sum_cuda_kernel<<>>( + sparse_A.data_ptr(), + indices.data_ptr(), + dense_C.data_ptr(), + batch_size, + A_num_block, + B_num_block, + num_block + ); + + return dense_C; +} + +at::Tensor scatter_kernel( + at::Tensor dense_A, // [batch_size, A_num_block, 32] + at::Tensor indices, // [batch_size, num_block] + int B_num_block +) { + int batch_size = dense_A.size(0); + int A_num_block = dense_A.size(1); + int num_block = indices.size(1); + + at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options()); + + dim3 threads(32, 4); + dim3 blocks(num_block / 4, batch_size); + + scatter_cuda_kernel<<>>( + dense_A.data_ptr(), + indices.data_ptr(), + sparse_C.data_ptr(), + batch_size, + A_num_block, + B_num_block, + num_block + ); + + return sparse_C; +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h new file mode 100644 index 0000000000000000000000000000000000000000..0200140ee337b8c5d9583767bbad1e842e9d4677 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h @@ -0,0 +1,39 @@ +#include +#include +#include + +#define min(a, b) ((a)<(b)?(a):(b)) +#define max(a, b) ((a)>(b)?(a):(b)) + +std::vector index_max_kernel( + at::Tensor index_vals, + at::Tensor indices, + int A_num_block, + int B_num_block +); + +at::Tensor mm_to_sparse_kernel( + at::Tensor dense_A, + at::Tensor dense_B, + at::Tensor indices +); + +at::Tensor sparse_dense_mm_kernel( + at::Tensor sparse_A, + at::Tensor indices, + at::Tensor dense_B, + int A_num_block +); + +at::Tensor reduce_sum_kernel( + at::Tensor sparse_A, + at::Tensor indices, + int A_num_block, + int B_num_block +); + +at::Tensor scatter_kernel( + at::Tensor dense_A, + at::Tensor indices, + int B_num_block +); diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60c9262b779270a6e95ae54f53a67daa6d740a9e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp @@ -0,0 +1,78 @@ +#include +#include +#include "cuda_launch.h" +#include + +std::vector index_max( + at::Tensor index_vals, + at::Tensor indices, + int A_num_block, + int B_num_block +) { + return index_max_kernel( + index_vals, + indices, + A_num_block, + B_num_block + ); +} + +at::Tensor mm_to_sparse( + at::Tensor dense_A, + at::Tensor dense_B, + at::Tensor indices +) { + return mm_to_sparse_kernel( + dense_A, + dense_B, + indices + ); +} + +at::Tensor sparse_dense_mm( + at::Tensor sparse_A, + at::Tensor indices, + at::Tensor dense_B, + int A_num_block +) { + return sparse_dense_mm_kernel( + sparse_A, + indices, + dense_B, + A_num_block + ); +} + +at::Tensor reduce_sum( + at::Tensor sparse_A, + at::Tensor indices, + int A_num_block, + int B_num_block +) { + return reduce_sum_kernel( + sparse_A, + indices, + A_num_block, + B_num_block + ); +} + +at::Tensor scatter( + at::Tensor dense_A, + at::Tensor indices, + int B_num_block +) { + return scatter_kernel( + dense_A, + indices, + B_num_block + ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("index_max", &index_max, "index_max (CUDA)"); + m.def("mm_to_sparse", &mm_to_sparse, "mm_to_sparse (CUDA)"); + m.def("sparse_dense_mm", &sparse_dense_mm, "sparse_dense_mm (CUDA)"); + m.def("reduce_sum", &reduce_sum, "reduce_sum (CUDA)"); + m.def("scatter", &scatter, "scatter (CUDA)"); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..571d5a8a8307e95aac689eb3c9333d1ad350c7de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu @@ -0,0 +1,187 @@ +#include +#include + +#define MIN_VALUE (-1e38) + +template +__global__ void kernel_forward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +template +__global__ void kernel_forward_with_state( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y, F *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + F *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +template +__global__ void kernel_backward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _y, + const F *__restrict__ const _gy, F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, + F *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + const F *__restrict__ const y = _y + _offset; + const F *__restrict__ const gy = _gy + _offset; + F *__restrict__ const gk = _gk + _offset; + F *__restrict__ const gv = _gv + _offset; + + F q[Tmax], r[Tmax]; + + F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + const F qq = gy[ii] / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = gu; + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + const F qq = q[i]; + const F rr = r[i]; + + F e1 = qq * exp(rr); + F e2 = exp(kk + pp); + gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb); + gv[ii] = e1 + e2 * aa; + + const F ww = w + pp; + const F www = rr - u - kk; + const F p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward<<>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu new file mode 100644 index 0000000000000000000000000000000000000000..042cb4aba1db98be5916aea1de86a7fed0b6510d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu @@ -0,0 +1,186 @@ +#include +#include +#include "ATen/ATen.h" +#define MIN_VALUE (-1e38) +typedef at::BFloat16 bf16; + +__global__ void kernel_forward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2)); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +__global__ void kernel_forward_with_state_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y, + float *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + float *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16(e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +__global__ void kernel_backward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, const bf16 *__restrict__ const _y, + const bf16 *__restrict__ const _gy, bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, + bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + const bf16 *__restrict__ const y = _y + _offset; + const bf16 *__restrict__ const gy = _gy + _offset; + bf16 *__restrict__ const gk = _gk + _offset; + bf16 *__restrict__ const gv = _gv + _offset; + + float q[Tmax], r[Tmax]; + + float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + const float qq = float(gy[ii]) / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = bf16(gu); + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + const float qq = q[i]; + const float rr = r[i]; + + float e1 = qq * exp(rr); + float e2 = exp(kk + pp); + gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb)); + gv[ii] = bf16(e1 + e2 * aa); + + const float ww = w + pp; + const float www = rr - u - kk; + const float p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_bf16<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state_bf16<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward_bf16<<>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..55e7280665927b523a88021d5111daf28a63c905 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp @@ -0,0 +1,66 @@ +#include +#include "ATen/ATen.h" +typedef at::BFloat16 bf16; + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y); +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s); +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s); +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv); +void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv); + +void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_with_state(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void forward_with_state_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} +void backward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), + gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &forward, "wkv forward"); + m.def("forward_bf16", &forward_bf16, "wkv forward bf16"); + m.def("forward_with_state", &forward_with_state, "wkv forward with state"); + m.def("forward_with_state_bf16", &forward_with_state_bf16, "wkv forward with state bf16"); + m.def("backward", &backward, "wkv backward"); + m.def("backward_bf16", &backward_bf16, "wkv backward bf16"); +} + +TORCH_LIBRARY(wkv, m) { + m.def("forward", forward); + m.def("forward_bf16", forward_bf16); + m.def("forward_with_state", forward_with_state); + m.def("forward_with_state_bf16", forward_with_state_bf16); + m.def("backward", backward); + m.def("backward_bf16", backward_bf16); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h new file mode 100644 index 0000000000000000000000000000000000000000..e5085c88dd3ea9a12eec264a8c48946bf2b80b23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h @@ -0,0 +1,10 @@ + +#define min(a, b) ((a)<(b)?(a):(b)) +#define max(a, b) ((a)>(b)?(a):(b)) +#define ceil_divide(a, b) ((a)/(b)+((a)%(b)!=0)) +#define select(cond, a, b) ((cond)?(a):(b)) +#define PI 3.141592 +#define EPSILON 1e-8 +#define MAX_VAL 1e12 +#define MIN_VAL -1e12 +#define EMPTY_VALUE -1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..97030870649a2fdac58cb26cf966e8f5c8cc7909 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h @@ -0,0 +1,9 @@ + +#define MAX_THREADS_PER_BLOCK 1024 +#define OPTIMAL_THREADS_PER_BLOCK 256 +#define WARP_SIZE 32 +#define MAX_NUM_BLOCK_X 2147483647 +#define MAX_NUM_BLOCK_Y 65535 +#define MAX_NUM_BLOCK_Z 65535 +#define MAX_SHARED_MEM_PER_BLOCK 48000 +#define FULL_MASK 0xffffffff diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h new file mode 100644 index 0000000000000000000000000000000000000000..6674f93afdc25ab35c5d83881d00028bcf2989fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h @@ -0,0 +1,79 @@ + +#include "common.h" + +template +__device__ int set_insert(T *set, int set_size, T value) { + int slot = value % set_size; + int start_slot = slot; + while (true) { + T prev = atomicCAS(&set[slot], EMPTY_VALUE, value); + if (prev == EMPTY_VALUE || prev == value) { + return slot; + } + slot = (slot + 1) % set_size; + if (slot == start_slot) { + return -1; + } + } + return -1; +} + +template +__device__ int set_lookup(T *set, int set_size, T value) { + int slot = value % set_size; + int start_slot = slot; + while (true) { + if (set[slot] == value) { + return slot; + } + slot = (slot + 1) % set_size; + if (slot == start_slot) { + return -1; + } + } + return -1; +} + +template +__device__ void init_buffer(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) { + __syncthreads(); + for (int i = 0; i < buffer_size; i = i + num_threads) { + int offset_idx = i + thread_id; + if (offset_idx < buffer_size) { + buffer[offset_idx] = init_value; + } + } + __syncthreads(); +} + +template +__device__ void copy_data(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) { + __syncthreads(); + for (int i = 0; i < data_length; i = i + num_threads) { + int offset_idx = i + thread_id; + if (offset_idx < data_length) { + dist_pt[offset_idx] = src_pt[offset_idx]; + } + } + __syncthreads(); +} + +template +__device__ void init_buffer_nonblocking(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) { + for (int i = 0; i < buffer_size; i = i + num_threads) { + int offset_idx = i + thread_id; + if (offset_idx < buffer_size) { + buffer[offset_idx] = init_value; + } + } +} + +template +__device__ void copy_data_nonblocking(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) { + for (int i = 0; i < data_length; i = i + num_threads) { + int offset_idx = i + thread_id; + if (offset_idx < data_length) { + dist_pt[offset_idx] = src_pt[offset_idx]; + } + } +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu new file mode 100644 index 0000000000000000000000000000000000000000..c6b13e6cb5f53c9c62e51d2c399a14d14dab7037 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu @@ -0,0 +1,588 @@ +// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation.cu + +#include +#include +#include "fast_lsh_cumulation.h" +#include "fast_lsh_cumulation_cuda.h" +#include "common_cuda.h" +#include "common.h" +#include +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector fast_hash_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_vector, + at::Tensor key_mask, + at::Tensor key_vector, + int num_hash_f, + int hash_code_len, + bool use_cuda +) { + + int batch_size = query_vector.size(0); + int num_query = query_vector.size(1); + int num_key = key_vector.size(1); + int vector_dim = query_vector.size(2); + + int num_hash_per_part = vector_dim / hash_code_len; + int num_part = max(1, ceil_divide(num_hash_f, num_hash_per_part)); + + at::Tensor Dmat = 2 * at::randint(0, 2, {batch_size, 3, num_part, vector_dim}, query_mask.options()) - 1; + at::Tensor query_hash_code = at::zeros({batch_size, num_query, num_hash_f}, query_mask.options()); + at::Tensor key_hash_code = at::zeros({batch_size, num_key, num_hash_f}, key_mask.options()); + + int *query_mask_ptr = query_mask.data_ptr(); + float *query_vector_ptr = query_vector.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + float *key_vector_ptr = key_vector.data_ptr(); + + int *Dmat_ptr = Dmat.data_ptr(); + + int *query_hash_code_ptr = query_hash_code.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + + if (use_cuda) { + { + dim3 threads(vector_dim); + dim3 blocks(num_part, num_query, batch_size); + int shared_mem = vector_dim * sizeof(float); + fast_hash_ver1_cuda_kernel<<>>( + query_mask_ptr, + query_vector_ptr, + Dmat_ptr, + query_hash_code_ptr, + batch_size, + num_query, + vector_dim, + num_part, + num_hash_f, + hash_code_len + ); + } + { + dim3 threads(vector_dim); + dim3 blocks(num_part, num_key, batch_size); + int shared_mem = vector_dim * sizeof(float); + fast_hash_ver1_cuda_kernel<<>>( + key_mask_ptr, + key_vector_ptr, + Dmat_ptr, + key_hash_code_ptr, + batch_size, + num_key, + vector_dim, + num_part, + num_hash_f, + hash_code_len + ); + } + } + + return {query_hash_code, key_hash_code}; + +} + +at::Tensor lsh_cumulation_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +) { + + int batch_size = query_hash_code.size(0); + int num_hash_f = query_hash_code.size(2); + + int num_query = query_hash_code.size(1); + int num_key = key_hash_code.size(1); + int value_dim = value.size(2); + + at::Tensor hashtable_value = at::empty({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options()); + at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options()); + + if (use_cuda) { + int threads_x = WARP_SIZE; + int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE; + int block_x_step1 = num_key / threads_y; + int block_x_step2 = num_query / threads_y; + int block_y = batch_size; + + dim3 threads(threads_x, threads_y); + dim3 blocks_step1(block_x_step1, block_y); + dim3 blocks_step2(block_x_step2, block_y); + + int *query_mask_ptr = query_mask.data_ptr(); + int *query_hash_code_ptr = query_hash_code.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + float *value_ptr = value.data_ptr(); + float *hashtable_value_ptr = hashtable_value.data_ptr(); + float *cumulation_value_ptr = cumulation_value.data_ptr(); + + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + + cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float)); + + lsh_cumulation_ver1_step1_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + value_ptr, + hashtable_value_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key, + value_dim, + value_offset + ); + + lsh_cumulation_ver1_step2_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + hashtable_value_ptr, + cumulation_value_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query, + value_dim, + value_offset + ); + } + + } + + return cumulation_value; + +} + +at::Tensor lsh_weighted_cumulation_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +) { + + int batch_size = query_hash_code.size(0); + int num_hash_f = query_hash_code.size(2); + + int num_query = query_hash_code.size(1); + int num_key = key_hash_code.size(1); + int value_dim = value.size(2); + int weight_dim = query_weight.size(2); + + at::Tensor hashtable_value = at::zeros({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options()); + at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options()); + + if (use_cuda) { + int threads_x = WARP_SIZE; + int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE; + int block_x_step1 = num_key / threads_y; + int block_x_step2 = num_query / threads_y; + int block_y = batch_size; + + dim3 threads(threads_x, threads_y); + dim3 blocks_step1(block_x_step1, block_y); + dim3 blocks_step2(block_x_step2, block_y); + + int *query_mask_ptr = query_mask.data_ptr(); + int *query_hash_code_ptr = query_hash_code.data_ptr(); + float *query_weight_ptr = query_weight.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + float *key_weight_ptr = key_weight.data_ptr(); + float *value_ptr = value.data_ptr(); + float *hashtable_value_ptr = hashtable_value.data_ptr(); + float *cumulation_value_ptr = cumulation_value.data_ptr(); + + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + for (int weight_idx = 0; weight_idx < weight_dim; weight_idx++) { + + cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float)); + + lsh_weighted_cumulation_ver1_step1_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + key_weight_ptr, + value_ptr, + hashtable_value_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key, + value_dim, + weight_dim, + value_offset, + weight_idx + ); + + lsh_weighted_cumulation_ver1_step2_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + query_weight_ptr, + hashtable_value_ptr, + cumulation_value_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query, + value_dim, + weight_dim, + value_offset, + weight_idx + ); + } + } + + } + + return cumulation_value; + +} + +at::Tensor lsh_weighted_cumulation_ver2_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +) { + + int batch_size = query_hash_code.size(0); + int num_hash_f = query_hash_code.size(2); + + int num_query = query_hash_code.size(1); + int num_key = key_hash_code.size(1); + int value_dim = value.size(2); + int weight_dim = query_weight.size(2); + + at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options()); + at::Tensor key_sorted_idxes = at::zeros({batch_size, num_hash_f, num_key}, query_hash_code.options()); + at::Tensor query_info = at::zeros({batch_size, num_query, 2, num_hash_f}, query_hash_code.options()); + at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options()); + + if (use_cuda) { + + int *query_mask_ptr = query_mask.data_ptr(); + int *query_hash_code_ptr = query_hash_code.data_ptr(); + float *query_weight_ptr = query_weight.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + float *key_weight_ptr = key_weight.data_ptr(); + float *value_ptr = value.data_ptr(); + + int *count_sort_table_ptr = count_sort_table.data_ptr(); + int *key_sorted_idxes_ptr = key_sorted_idxes.data_ptr(); + int *query_info_ptr = query_info.data_ptr(); + + float *cumulation_value_ptr = cumulation_value.data_ptr(); + + { + dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks_step13(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK)); + dim3 blocks_step2(num_hash_f, batch_size); + int shared_mem = hashtable_capacity * sizeof(float); + count_sort_step1_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key + ); + count_sort_step2_cuda_kernel<<>>( + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity + ); + count_sort_step3_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + count_sort_table_ptr, + key_sorted_idxes_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key + ); + } + { + dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + extract_query_info_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + count_sort_table_ptr, + query_info_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query + ); + } + { + dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE); + dim3 blocks(num_query, num_hash_f, batch_size); + int shared_mem = (weight_dim + WARP_SIZE) * sizeof(float); + lsh_weighted_cumulation_ver2_step2_cuda_kernel<<>>( + query_mask_ptr, + query_info_ptr, + key_sorted_idxes_ptr, + query_weight_ptr, + key_weight_ptr, + value_ptr, + cumulation_value_ptr, + batch_size, + num_hash_f, + num_query, + num_key, + value_dim, + weight_dim + ); + } + } + + return cumulation_value; + +} + +at::Tensor lsh_weighted_cumulation_ver3_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +) { + + int batch_size = query_hash_code.size(0); + int num_hash_f = query_hash_code.size(2); + + int num_query = query_hash_code.size(1); + int num_key = key_hash_code.size(1); + int value_dim = value.size(2); + int weight_dim = query_weight.size(2); + + at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options()); + at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options()); + at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options()); + at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options()); + + if (use_cuda) { + + int *query_mask_ptr = query_mask.data_ptr(); + int *query_hash_code_ptr = query_hash_code.data_ptr(); + float *query_weight_ptr = query_weight.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + float *key_weight_ptr = key_weight.data_ptr(); + float *value_ptr = value.data_ptr(); + + int *count_sort_table_ptr = count_sort_table.data_ptr(); + int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr(); + int *key_info_ptr = key_info.data_ptr(); + + float *cumulation_value_ptr = cumulation_value.data_ptr(); + + { + dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK)); + dim3 blocks_step2(num_hash_f, batch_size); + int shared_mem = hashtable_capacity * sizeof(float); + count_sort_step1_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query + ); + count_sort_step2_cuda_kernel<<>>( + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity + ); + count_sort_step3_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + count_sort_table_ptr, + query_sorted_idxes_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query + ); + } + { + dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + extract_query_info_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + count_sort_table_ptr, + key_info_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key + ); + } + { + dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE); + dim3 blocks(num_key, num_hash_f, batch_size); + int shared_mem = (weight_dim + value_dim + WARP_SIZE) * sizeof(float); + lsh_weighted_cumulation_ver3_step2_cuda_kernel<<>>( + query_sorted_idxes_ptr, + key_mask_ptr, + key_info_ptr, + query_weight_ptr, + key_weight_ptr, + value_ptr, + cumulation_value_ptr, + batch_size, + num_hash_f, + num_query, + num_key, + value_dim, + weight_dim + ); + } + } + + return cumulation_value; + +} + +at::Tensor lsh_weighted_cumulation_ver4_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +) { + + int batch_size = query_hash_code.size(0); + int num_hash_f = query_hash_code.size(2); + + int num_query = query_hash_code.size(1); + int num_key = key_hash_code.size(1); + int value_dim = value.size(2); + int weight_dim = query_weight.size(2); + + at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options()); + at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options()); + at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options()); + at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options()); + + if (use_cuda) { + + int *query_mask_ptr = query_mask.data_ptr(); + int *query_hash_code_ptr = query_hash_code.data_ptr(); + float *query_weight_ptr = query_weight.data_ptr(); + int *key_mask_ptr = key_mask.data_ptr(); + int *key_hash_code_ptr = key_hash_code.data_ptr(); + float *key_weight_ptr = key_weight.data_ptr(); + float *value_ptr = value.data_ptr(); + + int *count_sort_table_ptr = count_sort_table.data_ptr(); + int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr(); + int *key_info_ptr = key_info.data_ptr(); + + float *cumulation_value_ptr = cumulation_value.data_ptr(); + + { + dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK)); + dim3 blocks_step2(num_hash_f, batch_size); + int shared_mem = hashtable_capacity * sizeof(float); + count_sort_step1_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query + ); + count_sort_step2_cuda_kernel<<>>( + count_sort_table_ptr, + batch_size, + num_hash_f, + hashtable_capacity + ); + count_sort_step3_cuda_kernel<<>>( + query_mask_ptr, + query_hash_code_ptr, + count_sort_table_ptr, + query_sorted_idxes_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_query + ); + } + { + dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f)); + dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); + extract_query_info_cuda_kernel<<>>( + key_mask_ptr, + key_hash_code_ptr, + count_sort_table_ptr, + key_info_ptr, + batch_size, + num_hash_f, + hashtable_capacity, + num_key + ); + } + { + dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE); + dim3 blocks(num_key, batch_size); + int shared_mem = (weight_dim + value_dim + 2 * num_hash_f) * sizeof(float); + lsh_weighted_cumulation_ver4_step2_cuda_kernel<<>>( + query_sorted_idxes_ptr, + key_mask_ptr, + key_info_ptr, + query_weight_ptr, + key_weight_ptr, + value_ptr, + cumulation_value_ptr, + batch_size, + num_hash_f, + num_query, + num_key, + value_dim, + weight_dim + ); + } + } + + return cumulation_value; + +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h new file mode 100644 index 0000000000000000000000000000000000000000..dd48de0ed159f49ee3afe93b12aaae719fe87688 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h @@ -0,0 +1,71 @@ +#include +#include +#include + +std::vector fast_hash_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_vector, + at::Tensor key_mask, + at::Tensor key_vector, + int num_hash_f, + int hash_code_len, + bool use_cuda +); + +at::Tensor lsh_cumulation_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +); + +at::Tensor lsh_weighted_cumulation_ver1_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +); + +at::Tensor lsh_weighted_cumulation_ver2_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +); + +at::Tensor lsh_weighted_cumulation_ver3_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +); + +at::Tensor lsh_weighted_cumulation_ver4_kernel( + at::Tensor query_mask, + at::Tensor query_hash_code, + at::Tensor query_weight, + at::Tensor key_mask, + at::Tensor key_hash_code, + at::Tensor key_weight, + at::Tensor value, + int hashtable_capacity, + bool use_cuda +); diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..22944e97044659f896451936c6253d5aadd7a769 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu @@ -0,0 +1,825 @@ +// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation_cuda.cu + +#include "fast_lsh_cumulation_cuda.h" +#include "common_cuda_device.h" +#include "common_cuda.h" +#include "common.h" +#include +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void fast_hadamard_transform(float *vector_buffer, int vector_dim, int dim_idx) { + int stride = vector_dim / 2; + while (stride > (WARP_SIZE / 2)) { + __syncthreads(); + int sign = 1 - ((dim_idx / stride) % 2) * 2; + float val1 = vector_buffer[dim_idx]; + float val2 = vector_buffer[dim_idx + sign * stride]; + __syncthreads(); + vector_buffer[dim_idx] = float(sign) * val1 + val2; + stride = stride / 2; + } + + float val = vector_buffer[dim_idx]; + #pragma unroll + for (stride = (WARP_SIZE / 2); stride > 0; stride = stride / 2) { + int sign = 1 - ((dim_idx / stride) % 2) * 2; + val = float(sign) * val + __shfl_xor_sync(FULL_MASK, val, stride); + } + vector_buffer[dim_idx] = val; +} + +__global__ void fast_hash_ver1_cuda_kernel( + int *mask, // [batch_size, num_vector] + float *vector, // [batch_size, num_vector, vector_dim] + int *Dmat, // [batch_size, 3, num_part, vector_dim] + int *hash_code, // [batch_size, num_vector, num_hash_f] + int batch_size, + int num_vector, + int vector_dim, + int num_part, + int num_hash_f, + int hash_code_len +) { + + int batch_idx = blockIdx.z; + int vector_idx = blockIdx.y; + int part_idx = blockIdx.x; + + int dim_idx = threadIdx.x; + + int batch_idx__vector_idx = batch_idx * num_vector + vector_idx; + if (mask[batch_idx__vector_idx] == 0) { + return; + } + + extern __shared__ float buffer[]; + float *vector_buffer = buffer; + + vector_buffer[dim_idx] = vector[batch_idx__vector_idx * vector_dim + dim_idx]; + + vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 0) * num_part + part_idx) * vector_dim + dim_idx]; + fast_hadamard_transform(vector_buffer, vector_dim, dim_idx); + vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 1) * num_part + part_idx) * vector_dim + dim_idx]; + fast_hadamard_transform(vector_buffer, vector_dim, dim_idx); + vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 2) * num_part + part_idx) * vector_dim + dim_idx]; + fast_hadamard_transform(vector_buffer, vector_dim, dim_idx); + + int num_hash_per_part = vector_dim / hash_code_len; + if (hash_code_len == 8 || hash_code_len == 16) { + int code = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0); + for (int offset = 1; offset < hash_code_len; offset = offset * 2) { + code += __shfl_xor_sync(FULL_MASK, code, offset); + } + if (dim_idx % hash_code_len == 0) { + int hash_f_idx = part_idx * num_hash_per_part + dim_idx / hash_code_len; + if (hash_f_idx < num_hash_f) { + hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code; + } + } + } else { + vector_buffer[dim_idx] = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0); + __syncthreads(); + if (dim_idx < num_hash_per_part) { + int code = 0; + for (int i = 0; i < hash_code_len; i++) { + code += vector_buffer[dim_idx * hash_code_len + i]; + } + int hash_f_idx = part_idx * num_hash_per_part + dim_idx; + if (hash_f_idx < num_hash_f) { + hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code; + } + } + } +} + +__global__ void lsh_cumulation_ver1_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + float *value, // [batch_size, num_key, value_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key, + int value_dim, + int offset_warp +) { + + int warp_thread_idx = threadIdx.x; + + int batch_idx = blockIdx.y; + int key_idx = blockIdx.x * blockDim.y + threadIdx.y; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + if (num_hash_f > WARP_SIZE) { + float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx]; + for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) { + int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx]; + #pragma unroll + for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); + int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode; + atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); + } + } + } else { + float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx]; + int warp_hashcode = 0; + if (warp_thread_idx < num_hash_f) { + warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx]; + } + for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); + int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode; + atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); + } + } + +} + +__global__ void lsh_cumulation_ver1_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query, + int value_dim, + int offset_warp +) { + + int warp_thread_idx = threadIdx.x; + + int batch_idx = blockIdx.y; + int query_idx = blockIdx.x * blockDim.y + threadIdx.y; + + int batch_idx__query_idx = batch_idx * num_query + query_idx; + if (query_mask[batch_idx__query_idx] == 0) { + return; + } + + if (num_hash_f > WARP_SIZE) { + float warp_value = 0; + for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) { + int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx]; + #pragma unroll + for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); + int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode; + warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx]; + } + } + cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f); + } else { + float warp_value = 0; + int warp_hashcode = 0; + if (warp_thread_idx < num_hash_f) { + warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx]; + } + for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); + int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode; + warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx]; + } + cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f); + } + +} + +__global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key, + int value_dim, + int weight_dim, + int offset_warp, + int weight_idx +) { + + int warp_thread_idx = threadIdx.x; + + int batch_idx = blockIdx.y; + int key_idx = blockIdx.x * blockDim.y + threadIdx.y; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + if (num_hash_f > WARP_SIZE) { + float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx]; + for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) { + int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx]; + #pragma unroll + for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); + int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode; + atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); + } + } + } else { + float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx]; + int warp_hashcode = 0; + if (warp_thread_idx < num_hash_f) { + warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx]; + } + for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); + int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode; + atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); + } + } + +} + +__global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query, + int value_dim, + int weight_dim, + int offset_warp, + int weight_idx +) { + + int warp_thread_idx = threadIdx.x; + + int batch_idx = blockIdx.y; + int query_idx = blockIdx.x * blockDim.y + threadIdx.y; + + int batch_idx__query_idx = batch_idx * num_query + query_idx; + if (query_mask[batch_idx__query_idx] == 0) { + return; + } + + if (num_hash_f > WARP_SIZE) { + float warp_value = 0; + for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) { + int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx]; + #pragma unroll + for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); + int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode; + warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx]; + } + } + float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx]; + cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f); + } else { + float warp_value = 0; + int warp_hashcode = 0; + if (warp_thread_idx < num_hash_f) { + warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx]; + } + for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) { + int current_hashcode = warp_hashcode; + current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); + int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode; + warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx]; + } + float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx]; + cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f); + } + +} + +__global__ void count_sort_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key +) { + + int batch_idx = blockIdx.y; + int key_idx = blockIdx.x * blockDim.y + threadIdx.y; + int hash_f_idx = threadIdx.x; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + int hash_code = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_idx]; + atomicAdd(&count_sort_table[(batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + hash_code], 1); + +} + +__global__ void count_sort_step2_cuda_kernel( + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int batch_size, + int num_hash_f, + int hashtable_capacity +) { + + int batch_idx = blockIdx.y; + int hash_f_idx = blockIdx.x; + + int num_threads = blockDim.x; + int thread_id = threadIdx.x; + + int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx; + + extern __shared__ float buffer[]; + int *table_buffer = (int*)buffer; + + if (thread_id == 0) { + table_buffer[0] = 0; + } + copy_data(&count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], &table_buffer[1], hashtable_capacity - 1, num_threads, thread_id); + + for (int table_idx_start = 0; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + num_threads) { + int thread_value = table_buffer[table_idx_start + thread_id]; + int next_thread_value = 0; + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + next_thread_value = __shfl_up_sync(FULL_MASK, thread_value, offset); + if (thread_id % WARP_SIZE >= offset) { + thread_value = thread_value + next_thread_value; + } + } + table_buffer[table_idx_start + thread_id] = thread_value; + } + __syncthreads(); + + if (hashtable_capacity > WARP_SIZE) { + if (thread_id < WARP_SIZE) { + for (int table_idx_start = WARP_SIZE; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + WARP_SIZE) { + table_buffer[table_idx_start + thread_id] += table_buffer[table_idx_start - 1]; + } + } + } + + copy_data(table_buffer, &count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], hashtable_capacity, num_threads, thread_id); + +} + + +__global__ void count_sort_step3_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int *key_sorted_idxes, // [batch_size, num_hash_f, num_key] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key +) { + + int batch_idx = blockIdx.y; + int key_idx = blockIdx.x * blockDim.y + threadIdx.y; + int hash_f_idx = threadIdx.x; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx; + + int hash_code = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_idx]; + int sort_idx = atomicAdd(&count_sort_table[batch_idx__hash_f_idx * hashtable_capacity + hash_code], 1); + key_sorted_idxes[batch_idx__hash_f_idx * num_key + sort_idx] = key_idx; + +} + +__global__ void extract_query_info_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int *query_info, // [batch_size, num_query, 2, num_hash_f] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query +) { + + int batch_idx = blockIdx.y; + int query_idx = blockIdx.x * blockDim.y + threadIdx.y; + int hash_f_idx = threadIdx.x; + + int batch_idx__query_idx = batch_idx * num_query + query_idx; + if (query_mask[batch_idx__query_idx] == 0) { + return; + } + + int hash_code = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_idx]; + int batch_idx__hash_f_idx__hash_code = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + hash_code; + + int key_offset = select(hash_code == 0, 0, count_sort_table[batch_idx__hash_f_idx__hash_code - 1]); + int key_count = count_sort_table[batch_idx__hash_f_idx__hash_code] - key_offset; + + query_info[batch_idx__query_idx * 2 * num_hash_f + hash_f_idx] = key_offset; + query_info[(batch_idx__query_idx * 2 + 1) * num_hash_f + hash_f_idx] = key_count; + +} + +__global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_info, // [batch_size, num_query, 2, num_hash_f] + int *key_sorted_idxes, // [batch_size, num_hash_f, num_key] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +) { + + int batch_idx = blockIdx.z; + int hash_f_idx = blockIdx.y; + int query_idx = blockIdx.x; + + int num_threads = blockDim.y * blockDim.x; + int thread_id = threadIdx.y * blockDim.x + threadIdx.x; + + int num_warps = blockDim.y; + int warp_idx = threadIdx.y; + int warp_thread_idx = threadIdx.x; + + int batch_idx__query_idx = batch_idx * num_query + query_idx; + if (query_mask[batch_idx__query_idx] == 0) { + return; + } + + int key_offset = query_info[batch_idx__query_idx * 2 * num_hash_f + hash_f_idx]; + int key_count = query_info[(batch_idx__query_idx * 2 + 1) * num_hash_f + hash_f_idx]; + + if (key_count == 0) { + return; + } + + extern __shared__ float buffer[]; + + if (key_count == 1) { + if (warp_idx == 0) { + int key_idx = key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset]; + int batch_idx__key_idx = batch_idx * num_key + key_idx; + float weight = 0; + for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) { + int weight_dim_idx = weight_offset + warp_thread_idx; + float val = query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx]; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + val += __shfl_xor_sync(FULL_MASK, val, offset); + } + weight = weight + val; + } + weight = weight / float(num_hash_f); + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + int value_dim_idx = value_offset + warp_thread_idx; + float val = value[batch_idx__key_idx * value_dim + value_dim_idx]; + atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val); + } + } + } else { + float *weight_buffer = buffer; + int *key_idxes_buffer = (int*)&buffer[weight_dim]; + + copy_data_nonblocking(&query_weight[batch_idx__query_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id); + + while (key_count > 0) { + int work_size = min(WARP_SIZE, key_count); + copy_data_nonblocking(&key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset], key_idxes_buffer, work_size, num_threads, thread_id); + __syncthreads(); + for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) { + int work_idx = work_offset + warp_idx; + if (work_idx < key_count) { + int key_idx = key_idxes_buffer[work_idx]; + int batch_idx__key_idx = batch_idx * num_key + key_idx; + float weight = 0; + for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) { + int weight_dim_idx = weight_offset + warp_thread_idx; + float val = weight_buffer[weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx]; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + val += __shfl_xor_sync(FULL_MASK, val, offset); + } + weight = weight + val; + } + weight = weight / float(num_hash_f); + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + int value_dim_idx = value_offset + warp_thread_idx; + float val = value[batch_idx__key_idx * value_dim + value_dim_idx]; + atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val); + } + } + } + key_count = key_count - work_size; + key_offset = key_offset + work_size; + } + } + +} + +__global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel( + int *query_sorted_idxes, // [batch_size, num_hash_f, num_query] + int *key_mask, // [batch_size, num_key] + int *key_info, // [batch_size, num_key, 2, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +) { + + int batch_idx = blockIdx.z; + int hash_f_idx = blockIdx.y; + int key_idx = blockIdx.x; + + int num_threads = blockDim.y * blockDim.x; + int thread_id = threadIdx.y * blockDim.x + threadIdx.x; + + int num_warps = blockDim.y; + int warp_idx = threadIdx.y; + int warp_thread_idx = threadIdx.x; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + int query_offset = key_info[batch_idx__key_idx * 2 * num_hash_f + hash_f_idx]; + int query_count = key_info[(batch_idx__key_idx * 2 + 1) * num_hash_f + hash_f_idx]; + + if (query_count == 0) { + return; + } + + extern __shared__ float buffer[]; + + if (query_count == 1) { + if (warp_idx == 0) { + int query_idx = query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset]; + int batch_idx__query_idx = batch_idx * num_query + query_idx; + float weight = 0; + for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) { + int weight_dim_idx = weight_offset + warp_thread_idx; + float val = key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx]; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + val += __shfl_xor_sync(FULL_MASK, val, offset); + } + weight = weight + val; + } + weight = weight / float(num_hash_f); + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + int value_dim_idx = value_offset + warp_thread_idx; + float val = value[batch_idx__key_idx * value_dim + value_dim_idx]; + atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val); + } + } + } else { + float *weight_buffer = buffer; + float *value_buffer = &buffer[weight_dim]; + int *query_idxes_buffer = (int*)&buffer[weight_dim + value_dim]; + + copy_data_nonblocking(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id); + copy_data_nonblocking(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id); + + while (query_count > 0) { + int work_size = min(WARP_SIZE, query_count); + copy_data_nonblocking(&query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset], query_idxes_buffer, work_size, num_threads, thread_id); + __syncthreads(); + for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) { + int work_idx = work_offset + warp_idx; + if (work_idx < query_count) { + int query_idx = query_idxes_buffer[work_idx]; + int batch_idx__query_idx = batch_idx * num_query + query_idx; + float weight = 0; + for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) { + int weight_dim_idx = weight_offset + warp_thread_idx; + float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx]; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + val += __shfl_xor_sync(FULL_MASK, val, offset); + } + weight = weight + val; + } + weight = weight / float(num_hash_f); + for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) { + int value_dim_idx = value_offset + warp_thread_idx; + float val = value_buffer[value_dim_idx]; + atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val); + } + } + } + query_count = query_count - work_size; + query_offset = query_offset + work_size; + } + } + +} + +__global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel( + int *query_sorted_idxes, // [batch_size, num_hash_f, num_query] + int *key_mask, // [batch_size, num_key] + int *key_info, // [batch_size, num_key, 2, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +) { + + int batch_idx = blockIdx.y; + int key_idx = blockIdx.x; + + int num_threads = blockDim.y * blockDim.x; + int thread_id = threadIdx.y * blockDim.x + threadIdx.x; + + int num_warps = blockDim.y; + int warp_idx = threadIdx.y; + int warp_thread_idx = threadIdx.x; + + int batch_idx__key_idx = batch_idx * num_key + key_idx; + if (key_mask[batch_idx__key_idx] == 0) { + return; + } + + extern __shared__ float buffer[]; + float *weight_buffer = buffer; + float *value_buffer = &buffer[weight_dim]; + int *key_info_buffer = (int*)&buffer[weight_dim + value_dim]; + + copy_data_nonblocking(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id); + copy_data_nonblocking(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id); + copy_data_nonblocking(&key_info[batch_idx__key_idx * 2 * num_hash_f], key_info_buffer, 2 * num_hash_f, num_threads, thread_id); + + int *query_offset_buffer = key_info_buffer; + int *query_count_buffer = &key_info_buffer[num_hash_f]; + + const int hashtable_size = 1024 + OPTIMAL_THREADS_PER_BLOCK; + __shared__ int hashtable_query[hashtable_size]; + __shared__ int hashtable_count[hashtable_size]; + __shared__ int inserted_query[hashtable_size]; + __shared__ int query_counter[1]; + + int hash_f_idx_base = 0; + + while (true) { + + init_buffer_nonblocking(EMPTY_VALUE, hashtable_query, hashtable_size, num_threads, thread_id); + init_buffer_nonblocking(0, hashtable_count, hashtable_size, num_threads, thread_id); + init_buffer_nonblocking(EMPTY_VALUE, inserted_query, hashtable_size, num_threads, thread_id); + init_buffer_nonblocking(0, query_counter, 1, num_threads, thread_id); + __syncthreads(); + + while (hash_f_idx_base < num_hash_f) { + + int hash_f_idx = hash_f_idx_base + warp_idx; + int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx; + + int stop_flag = 0; + + int query_offset = query_offset_buffer[hash_f_idx]; + int query_count = query_count_buffer[hash_f_idx]; + + while (query_count > 0) { + + int work_size = min(query_count, WARP_SIZE); + + // try inserting query to set and check whether the query is new + int found_new_query = 0; + int query_idx = -1; + if (warp_thread_idx < work_size) { + query_idx = query_sorted_idxes[batch_idx__hash_f_idx * num_query + query_offset + warp_thread_idx]; + int slot = set_insert(hashtable_query, hashtable_size, query_idx); + if (slot >= 0) { + found_new_query = atomicAdd(&hashtable_count[slot], 1) == 0; + } + } + + // compute cumulative offset + int position_offset = found_new_query; + int next_position_offset = 0; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + next_position_offset = __shfl_up_sync(FULL_MASK, position_offset, offset); + if (thread_id % WARP_SIZE >= offset) { + position_offset = position_offset + next_position_offset; + } + } + + // get the inserted query list end index + int inserted_query_base = 0; + if (thread_id % WARP_SIZE == WARP_SIZE - 1) { + inserted_query_base = atomicAdd(query_counter, position_offset); + } + inserted_query_base = __shfl_sync(FULL_MASK, inserted_query_base, WARP_SIZE - 1); + + // insert new queries to list + int insert_idx = inserted_query_base + position_offset - 1; + if (found_new_query) { + inserted_query[insert_idx] = query_idx; + } + + // remove inserted queries from list + query_offset_buffer[hash_f_idx] += work_size; + query_count_buffer[hash_f_idx] -= work_size; + query_offset += work_size; + query_count -= work_size; + + // if list is almost full, stop inserting + if (inserted_query_base + OPTIMAL_THREADS_PER_BLOCK > hashtable_size) { + stop_flag = 1; + break; + } + + } + + if (stop_flag) { + break; + } + + hash_f_idx_base = hash_f_idx_base + num_warps; + + } + + __syncthreads(); + + int num_distinct_query = query_counter[0]; + + if (num_distinct_query > 0) { + for (int idx_base = 0; idx_base < num_distinct_query; idx_base = idx_base + num_warps) { + int idx = idx_base + warp_idx; + if (idx < num_distinct_query) { + int query_idx = inserted_query[idx]; + int batch_idx__query_idx = batch_idx * num_query + query_idx; + + int slot = set_lookup(hashtable_query, hashtable_size, query_idx); + int duplicate_count = hashtable_count[slot]; + + float weight = 0; + for (int weight_idx_base = 0; weight_idx_base < weight_dim; weight_idx_base = weight_idx_base + WARP_SIZE) { + int weight_dim_idx = weight_idx_base + warp_thread_idx; + float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx]; + #pragma unroll + for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) { + val += __shfl_xor_sync(FULL_MASK, val, offset); + } + weight = weight + val; + } + + weight = (float)duplicate_count * weight / float(num_hash_f); + + for (int value_idx_base = 0; value_idx_base < value_dim; value_idx_base = value_idx_base + WARP_SIZE) { + int value_dim_idx = value_idx_base + warp_thread_idx; + float val = value_buffer[value_dim_idx]; + atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val); + } + } + } + } else { + + // all computation is completed if num_distinct_query == 0 + break; + + } + + __syncthreads(); + + } + +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..b2adc0f735358d0fcb6a056e7d19ba745977e129 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h @@ -0,0 +1,157 @@ +__global__ void fast_hash_ver1_cuda_kernel( + int *mask, // [batch_size, num_vector] + float *vector, // [batch_size, num_vector, vector_dim] + int *Dmat, // [3, num_part, vector_dim] + int *hash_code, // [batch_size, num_vector, num_hash_f] + int batch_size, + int num_vector, + int vector_dim, + int num_part, + int num_hash_f, + int hash_code_len +); + +__global__ void lsh_cumulation_ver1_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + float *value, // [batch_size, num_key, value_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, value_dim] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key, + int value_dim, + int offset_warp +); + +__global__ void lsh_cumulation_ver1_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query, + int value_dim, + int offset_warp +); + +__global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key, + int value_dim, + int weight_dim, + int offset_warp, + int weight_idx +); + +__global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query, + int value_dim, + int weight_dim, + int offset_warp, + int weight_idx +); + +__global__ void count_sort_step1_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key +); + +__global__ void count_sort_step2_cuda_kernel( + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int batch_size, + int num_hash_f, + int hashtable_capacity +); + +__global__ void count_sort_step3_cuda_kernel( + int *key_mask, // [batch_size, num_key] + int *key_hash_code, // [batch_size, num_key, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int *key_sorted_idxes, // [batch_size, num_hash_f, num_key] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_key +); + +__global__ void extract_query_info_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_hash_code, // [batch_size, num_query, num_hash_f] + int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity] + int *query_info, // [batch_size, num_query, 2, num_hash_f] + int batch_size, + int num_hash_f, + int hashtable_capacity, + int num_query +); + +__global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel( + int *query_mask, // [batch_size, num_query] + int *query_info, // [batch_size, num_query, 2, num_hash_f] + int *key_sorted_idxes, // [batch_size, num_hash_f, num_key] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +); + +__global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel( + int *query_sorted_idxes, // [batch_size, num_hash_f, num_query] + int *key_mask, // [batch_size, num_key] + int *key_info, // [batch_size, num_key, 2, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +); + +__global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel( + int *query_sorted_idxes, // [batch_size, num_hash_f, num_query] + int *key_mask, // [batch_size, num_key] + int *key_info, // [batch_size, num_key, 2, num_hash_f] + float *query_weight, // [batch_size, num_query, weight_dim] + float *key_weight, // [batch_size, num_key, weight_dim] + float *value, // [batch_size, num_key, value_dim] + float *cumulation_value, // [batch_size, num_query, value_dim] + int batch_size, + int num_hash_f, + int num_query, + int num_key, + int value_dim, + int weight_dim +); diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e150a2be604b28f600ab345a8cc9e97819cca416 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp @@ -0,0 +1,128 @@ +#include +#include +#include "fast_lsh_cumulation.h" +#include "common_cuda.h" +#include + +std::vector fast_hash( + at::Tensor query_mask, + at::Tensor query_vector, + at::Tensor key_mask, + at::Tensor key_vector, + int num_hash_f, + int hash_code_len, + bool use_cuda, + int version +) { + return fast_hash_ver1_kernel( + query_mask, + query_vector, + key_mask, + key_vector, + num_hash_f, + hash_code_len, + use_cuda + ); +} + +at::Tensor lsh_cumulation( + at::Tensor query_mask, // [batch_size, num_query] + at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f] + at::Tensor key_mask, // [batch_size, num_key] + at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f] + at::Tensor value, // [batch_size, num_key, value_dim] + int hashtable_capacity, + bool use_cuda, + int version +) { + return lsh_cumulation_ver1_kernel( + query_mask, + query_hash_code, + key_mask, + key_hash_code, + value, + hashtable_capacity, + use_cuda + ); +} + +at::Tensor lsh_weighted_cumulation( + at::Tensor query_mask, // [batch_size, num_query] + at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f] + at::Tensor query_weight, // [batch_size, num_query, weight_dim] + at::Tensor key_mask, // [batch_size, num_key] + at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f] + at::Tensor key_weight, // [batch_size, num_key, weight_dim] + at::Tensor value, // [batch_size, num_key, value_dim] + int hashtable_capacity, + bool use_cuda, + int version +) { + if (version == 1) { + return lsh_weighted_cumulation_ver1_kernel( + query_mask, + query_hash_code, + query_weight, + key_mask, + key_hash_code, + key_weight, + value, + hashtable_capacity, + use_cuda + ); + } else if (version == 2) { + return lsh_weighted_cumulation_ver2_kernel( + query_mask, + query_hash_code, + query_weight, + key_mask, + key_hash_code, + key_weight, + value, + hashtable_capacity, + use_cuda + ); + } else if (version == 3) { + return lsh_weighted_cumulation_ver3_kernel( + query_mask, + query_hash_code, + query_weight, + key_mask, + key_hash_code, + key_weight, + value, + hashtable_capacity, + use_cuda + ); + } else if (version == 4) { + return lsh_weighted_cumulation_ver4_kernel( + query_mask, + query_hash_code, + query_weight, + key_mask, + key_hash_code, + key_weight, + value, + hashtable_capacity, + use_cuda + ); + } else { + return lsh_weighted_cumulation_ver3_kernel( + query_mask, + query_hash_code, + query_weight, + key_mask, + key_hash_code, + key_weight, + value, + hashtable_capacity, + use_cuda + ); + } +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("fast_hash", &fast_hash, "Fast Hash (CUDA)"); + m.def("lsh_cumulation", &lsh_cumulation, "LSH Cumulation (CUDA)"); + m.def("lsh_weighted_cumulation", &lsh_weighted_cumulation, "LSH Weighted Cumulation (CUDA)"); +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73b50ac619b49737ba5c95e14df269fce200a3c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee31bc357bf2c18699de9a156989e6d0f8448161 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29ddb3093bf39cbd46f8bfdac0cec96d659066da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b96a9fac5e2e6947334d3ce1fe4340ccc742e619 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c958dad1b9300b27f5b5f67e16fad9041e3db869 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee38568d1d9a571684f6287e6ff130232f75d175 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..222951d5911c1fde84b3ebf6e517cc22c31a318b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e17105fc8b2ebf45b3a2a65bf61df362635f77f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..079f9081369b373d69f2e2a99511bd2110af906f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95d9040f2d466a89c87866df5a9740a03471df4f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5adb5739acdecf033bccf0788fdaf6598d97c57 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6c0673b523a50279987dd33aa5c864ba82461bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44d91bec4925c7b5c67d01af81ff13a31eca51bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b18d297ebfcca692e60df4bbb71d18e4074bac6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36ccd2340c71300c8c4e21dc8e43222fec3121b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5a77eabd82e0107495108e2d7f90a8c3424623a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d0710570ddcfa74fb4ba04508b19e2c1f1d19ec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f2c1e4aa581ab3e7ac9c731df7ba80b20d203ad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe9dc464abea80063f5bdc3ccac46de2c1be744c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0967de343730870230a3c5394481706fd387bce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f414bbc30fbee4d561ce4c0d408bf8451ae1c4b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c242002eb9ff6f06db8451cff6fc7caeb3ad8f63 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d0640b0a2fd331c69de4f802f076af4a1bccc6d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f049b8909d9149b85b7ef778ff69818a128965b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd2a9b0d939082c8c9831734a47115d23686386a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ce1330306da65e924f63f6dbad8477628609a3e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f90b8cd66599a2fb03e11f19065b229ef9ddce10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0473f22e8c5b04cbf74dc8e03671b9c7b99550c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca84a320fdc4aa1f4cf99aef78154849514c8a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bridgetower import * + from .image_processing_bridgetower import * + from .image_processing_bridgetower_fast import * + from .modeling_bridgetower import * + from .processing_bridgetower import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..4c84b0a294dafb35c991654c6e7df79c3fe58452 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License=, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing=, software +# distributed under the License is distributed on an "AS IS" BASIS=, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BridgeTower model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BridgeTowerVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in visual encoder model. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + image_size (`int`, *optional*, defaults to 288): + The size (resolution) of each image. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + stop_gradient (`bool`, *optional*, defaults to `False`): + Whether to stop gradient for training. + share_layernorm (`bool`, *optional*, defaults to `True`): + Whether LayerNorm layers are shared. + remove_last_layer (`bool`, *optional*, defaults to `False`): + Whether to remove the last layer from the vision encoder. + + + Example: + + ```python + >>> from transformers import BridgeTowerVisionConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model + >>> configuration = BridgeTowerVisionConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + + model_type = "bridgetower_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_channels=3, + patch_size=16, + image_size=288, + initializer_factor=1, + layer_norm_eps=1e-05, + stop_gradient=False, + share_layernorm=True, + remove_last_layer=False, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.stop_gradient = stop_gradient + self.share_layernorm = share_layernorm + self.remove_last_layer = remove_last_layer + + +class BridgeTowerTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here + are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that + of the bridgetower-base [BridegTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the text part of the model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`BridgeTowerModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids`. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Example: + + ```python + >>> from transformers import BridgeTowerTextConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model + >>> configuration = BridgeTowerTextConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + + model_type = "bridgetower_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + initializer_factor=1, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + layer_norm_eps=1e-05, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_factor = initializer_factor + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + +class BridgeTowerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a + BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`): + Whether cross modal transformer layers are shared. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + share_link_tower_layers (`bool`, *optional*, defaults to `False`): + Whether the bride/link tower layers are shared. + link_tower_type (`str`, *optional*, defaults to `"add"`): + Type of the bridge/link layer. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`): + Whether to init LayerNorm from the vision encoder. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`]. + + Example: + + ```python + >>> from transformers import BridgeTowerModel, BridgeTowerConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration + >>> configuration = BridgeTowerConfig() + + >>> # Initializing a model from the BridgeTower/bridgetower-base style configuration + >>> model = BridgeTowerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bridgetower" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} + + def __init__( + self, + share_cross_modal_transformer_layers=True, + hidden_act="gelu", + hidden_size=768, + initializer_factor=1, + layer_norm_eps=1e-05, + share_link_tower_layers=False, + link_tower_type="add", + num_attention_heads=12, + num_hidden_layers=6, + tie_word_embeddings=False, + init_layernorm_from_vision_encoder=False, + text_config=None, + vision_config=None, + **kwargs, + ): + # TODO: remove this once the Hub files are updated. + _ = kwargs.pop("text_config_dict", None) + _ = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers + self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.share_link_tower_layers = share_link_tower_layers + self.link_tower_type = link_tower_type + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_hidden_layers + self.tie_word_embeddings = tie_word_embeddings + self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.") + + self.text_config = BridgeTowerTextConfig(**text_config) + self.vision_config = BridgeTowerVisionConfig(**vision_config) + + +__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..cb39ed0975610f65f2b44725ffbbbb43a0c18f3f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py @@ -0,0 +1,541 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BridgeTower.""" + +from collections.abc import Iterable +from typing import Any, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import PaddingMode, center_crop, pad, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.vilt.image_processing_vilt.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + shorter: int = 800, + longer: int = 1333, + size_divisor: int = 32, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + input_height, input_width = get_image_size(input_image, input_data_format) + min_size, max_size = shorter, longer + + scale = min_size / min(input_height, input_width) + + if input_height < input_width: + new_height = min_size + new_width = scale * input_width + else: + new_height = scale * input_height + new_width = min_size + + if max(new_height, new_width) > max_size: + scale = max_size / max(new_height, new_width) + new_height = scale * new_height + new_width = scale * new_width + + new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) + new_height = new_height // size_divisor * size_divisor + new_width = new_width // size_divisor * size_divisor + + return new_height, new_width + + +class BridgeTowerImageProcessor(BaseImageProcessor): + r""" + Constructs a BridgeTower image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{'shortest_edge': 288}`): + Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under + `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if + `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. + size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. Can be overridden by the `do_center_crop` parameter in the `preprocess` + method. + crop_size (`dict[str, int]`, *optional*): + Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. + Can be overridden by the `crop_size` parameter in the `preprocess` method. If unset defaults to `size`, + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by + the `do_pad` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 288} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.size_divisor = size_divisor + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_pad = do_pad + self.do_center_crop = do_center_crop + self.crop_size = crop_size + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + size_divisor (`int`, *optional*, defaults to 32): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") + shorter = size["shortest_edge"] + longer = int(1333 / 800 * shorter) + output_size = get_resize_output_image_size( + image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def center_crop( + self, + image: np.ndarray, + size: dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along + any edge, the image is padded with 0's and then center cropped. + + Args: + image (`np.ndarray`): + Image to center crop. + size (`dict[str, int]`): + Size of the output image in the form `{"height": h, "width": w}`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input + image. + """ + output_size = size["shortest_edge"] + return center_crop( + image, + size=(output_size, output_size), + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad + def pad( + self, + images: list[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + size_divisor: Optional[int] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also + created and returned. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the + image is padded with 0's and then center cropped. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be + padded with zeros and then cropped + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + # For backwards compatibility. Initial version of this processor was cropping to the "size" argument, which + # it should default to if crop_size is undefined. + crop_size = ( + crop_size if crop_size is not None else (self.crop_size if self.crop_size is not None else self.size) + ) + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + # Here, crop_size is used only if it is set, else size will be used. + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if do_resize: + images = [ + self.resize( + image=image, + size=size, + size_divisor=size_divisor, + resample=resample, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + if do_pad: + encoded_outputs = self.pad( + images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format + ) + else: + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + +__all__ = ["BridgeTowerImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..5be6f9f6c54b7bf6e973b9102179b63cbfe353d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py @@ -0,0 +1,280 @@ +# coding=utf-8 +# Copyright 2025 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BridgeTower.""" + +from collections.abc import Iterable +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, + ImageInput, + SizeDict, + TensorType, + Unpack, + group_images_by_shape, + reorder_images, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import auto_docstring + + +def make_pixel_mask( + image: "torch.Tensor", + output_size: tuple[int, int], +) -> "torch.Tensor": + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = image.shape[-2:] + batch_size = image.size(0) + mask = torch.zeros((batch_size, *output_size), dtype=torch.long) + mask[:input_height, :input_width] = 1 + return mask + + +def get_resize_output_image_size( + input_image: "torch.Tensor", + shorter: int = 800, + longer: int = 1333, + size_divisor: int = 32, +) -> tuple[int, int]: + input_height, input_width = input_image.shape[-2:] + min_size, max_size = shorter, longer + + scale = min_size / min(input_height, input_width) + + if input_height < input_width: + new_height = min_size + new_width = scale * input_width + else: + new_height = scale * input_height + new_width = min_size + + if max(new_height, new_width) > max_size: + scale = max_size / max(new_height, new_width) + new_height = scale * new_height + new_width = scale * new_width + + new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) + new_height = new_height // size_divisor * size_divisor + new_width = new_width // size_divisor * size_divisor + + return new_height, new_width + + +class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + Args: + size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + """ + + size_divisor: Optional[int] + + +@auto_docstring +class BridgeTowerImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 288} + default_to_square = False + crop_size = {"shortest_edge": 288} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_pad = True + size_divisor = 32 + valid_kwargs = BridgeTowerFastImageProcessorKwargs + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + size_divisor: int = 32, + interpolation: Optional["F.InterpolationMode"] = None, + antialias: bool = True, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + size_divisor (`int`, *optional*, defaults to 32): + The image is resized to a size that is a multiple of this value. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if not size.shortest_edge: + raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") + shorter = size.shortest_edge + longer = int(1333 / 800 * shorter) + output_height, output_width = get_resize_output_image_size( + image, + shorter=shorter, + longer=longer, + size_divisor=size_divisor, + ) + return super().resize( + image=image, + size=SizeDict(height=output_height, width=output_width), + interpolation=interpolation, + antialias=antialias, + ) + + def center_crop( + self, + image: "torch.Tensor", + size: dict[str, int], + **kwargs, + ) -> "torch.Tensor": + """ + Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along + any edge, the image is padded with 0's and then center cropped. + + Args: + image (`torch.Tensor`): + Image to center crop. + size (`dict[str, int]`): + Size of the output image in the form `{"height": h, "width": w}`. + """ + output_size = size.shortest_edge + return F.center_crop( + image, + output_size=(output_size, output_size), + **kwargs, + ) + + def _pad_image( + self, + image: "torch.Tensor", + output_size: tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + ) -> "torch.Tensor": + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = image.shape[-2:] + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = (0, 0, pad_right, pad_bottom) + padded_image = F.pad( + image, + padding, + fill=constant_values, + ) + return padded_image + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + size_divisor: Optional[int], + interpolation: Optional["F.InterpolationMode"], + do_pad: bool, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_images, size=size, size_divisor=size_divisor, interpolation=interpolation + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + data = {} + if do_pad: + processed_images, processed_masks = self.pad( + processed_images, return_mask=True, disable_grouping=disable_grouping + ) + processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks + data["pixel_mask"] = processed_masks + + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + data["pixel_values"] = processed_images + + return BatchFeature(data=data, tensor_type=return_tensors) + + def to_dict(self): + encoder_dict = super().to_dict() + encoder_dict.pop("_valid_processor_keys", None) + encoder_dict.pop("crop_size", None) + return encoder_dict + + +__all__ = ["BridgeTowerImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db2957f2ef98ff23397c91c9eadb93795eb22a9f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa9e6d5efddbd435936c37948ecd0930426c4131 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6edfae2e587252f3bd2122f985ccd9b9d9d659cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d9470bfa7a80a1f2ddca41d8b82bb0957b8d462 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d90f64de97f78ccaf1592c3c5cad40a9b2d5dcb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_camembert import * + from .modeling_camembert import * + from .modeling_tf_camembert import * + from .tokenization_camembert import * + from .tokenization_camembert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..3979e54874439aa41feb6abe3815df5c7a997419 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CamemBERT configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CamembertConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is + used to instantiate a Camembert model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert + [almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import CamembertConfig, CamembertModel + + >>> # Initializing a Camembert almanach/camembert-base style configuration + >>> configuration = CamembertConfig() + + >>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration + >>> model = CamembertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "camembert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class CamembertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["CamembertConfig", "CamembertOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..3a07402f739a603ad3b8a50a197b7784bfc1d2a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py @@ -0,0 +1,1576 @@ +# coding=utf-8 +# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CamemBERT model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, gelu +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Camembert +class CamembertEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Camembert +class CamembertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = self.query(hidden_states) + query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = self.key(current_states) + key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + value_layer = self.value(current_states) + value_layer = value_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSdpaSelfAttention with Roberta->Camembert +class CamembertSdpaSelfAttention(CamembertSelfAttention): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx) + self.dropout_prob = config.attention_probs_dropout_prob + + # Adapted from CamembertSelfAttention + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + past_key_values, + output_attentions, + cache_position, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = ( + self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(bsz, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(bsz, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. + is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + return attn_output, None + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->Camembert +class CamembertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +CAMEMBERT_SELF_ATTENTION_CLASSES = { + "eager": CamembertSelfAttention, + "sdpa": CamembertSdpaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT +class CamembertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, + position_embedding_type=position_embedding_type, + layer_idx=layer_idx, + ) + self.output = CamembertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Roberta->Camembert +class CamembertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Roberta->Camembert +class CamembertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Camembert +class CamembertLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CamembertAttention(config, layer_idx=layer_idx) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = CamembertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx) + self.intermediate = CamembertIntermediate(config) + self.output = CamembertOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->Camembert +class CamembertEncoder(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + self.config = config + self.layer = nn.ModuleList([CamembertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and self.config.is_decoder and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class CamembertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class CamembertPreTrainedModel(PreTrainedModel): + config: CamembertConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _supports_sdpa = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->CamembertLMHead + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CamembertLMHead): + module.bias.data.zero_() + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Camembert +class CamembertClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Camembert +class CamembertLMHead(nn.Module): + """Camembert Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + self.bias = self.decoder.bias + + +@auto_docstring +class CamembertModel(CamembertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to + `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762 + + """ + + _no_split_modules = [] + + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.__init__ with Roberta->Camembert + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = CamembertEmbeddings(config) + self.encoder = CamembertEncoder(config) + + self.pooler = CamembertPooler(config) if add_pooling_layer else None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks and attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + + if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMaskedLM(CamembertPreTrainedModel): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForSequenceClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.classifier = CamembertClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMultipleChoice(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta = CamembertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(reshaped_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForTokenClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForQuestionAnswering(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + CamemBERT Model with a `language modeling` head on top for CLM fine-tuning. + """ +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base +class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base") + >>> config = AutoConfig.from_pretrained("almanach/camembert-base") + >>> config.is_decoder = True + >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +__all__ = [ + "CamembertForCausalLM", + "CamembertForMaskedLM", + "CamembertForMultipleChoice", + "CamembertForQuestionAnswering", + "CamembertForSequenceClassification", + "CamembertForTokenClassification", + "CamembertModel", + "CamembertPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..0869902aa96245ead1bb22f2e818517288a35534 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py @@ -0,0 +1,1800 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 CamemBERT model.""" + +from __future__ import annotations + +import math +import warnings + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "almanach/camembert-base" +_CONFIG_FOR_DOC = "CamembertConfig" + + +CAMEMBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`CamembertConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CAMEMBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings +class TFCamembertEmbeddings(keras.layers.Layer): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.padding_idx = 1 + self.config = config + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + input_ids: tf.Tensor + Returns: tf.Tensor + """ + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask + + return incremental_indices + self.padding_idx + + def call( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + inputs_embeds=None, + past_key_values_length=0, + training=False, + ): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids=input_ids, past_key_values_length=past_key_values_length + ) + else: + position_ids = tf.expand_dims( + tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert +class TFCamembertPooler(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert +class TFCamembertSelfAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFCamembertModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert +class TFCamembertSelfOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert +class TFCamembertAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFCamembertSelfAttention(config, name="self") + self.dense_output = TFCamembertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert +class TFCamembertIntermediate(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert +class TFCamembertOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert +class TFCamembertLayer(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFCamembertAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFCamembertAttention(config, name="crossattention") + self.intermediate = TFCamembertIntermediate(config, name="intermediate") + self.bert_output = TFCamembertOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert +class TFCamembertEncoder(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: tuple[tuple[tf.Tensor]] | None, + use_cache: bool | None, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert +class TFCamembertMainLayer(keras.layers.Layer): + config_class = CamembertConfig + + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.encoder = TFCamembertEncoder(config, name="encoder") + self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None + # The embeddings must be the last declaration in order to follow the weights order + self.embeddings = TFCamembertEmbeddings(config, name="embeddings") + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values[0] is not None: + # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length] + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + + +class TFCamembertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CamembertConfig + base_model_prefix = "roberta" + + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertModel(TFCamembertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.roberta = TFCamembertMainLayer(config, name="roberta") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert +class TFCamembertLMHead(keras.layers.Layer): + """Camembert Head for masked language modeling.""" + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.act = get_tf_activation("gelu") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + + def get_output_embeddings(self): + return self.decoder + + def set_output_embeddings(self, value): + self.decoder.weight = value + self.decoder.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + # project back to size of vocabulary with bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top.""", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMaskedLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead +class TFCamembertClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, features, training=False): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.classifier = TFCamembertClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="cardiffnlp/twitter-roberta-base-emotion", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-large-ner-english", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, name="roberta") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + outputs = self.roberta( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-base-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config: CamembertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if not config.is_decoder: + logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = tf.ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + logits = self.lm_head(hidden_states=sequence_output, training=training) + loss = None + + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +__all__ = [ + "TFCamembertForCausalLM", + "TFCamembertForMaskedLM", + "TFCamembertForMultipleChoice", + "TFCamembertForQuestionAnswering", + "TFCamembertForSequenceClassification", + "TFCamembertForTokenClassification", + "TFCamembertModel", + "TFCamembertPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6e399f208df858d647dce23fe0ad74248d80b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +SPIECE_UNDERLINE = "▁" + + +@requires(backends=("sentencepiece",)) +class CamembertTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`list[str]`, *optional*, defaults to `['NOTUSED', 'NOTUSED', 'NOTUSED']`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True) + if isinstance(mask_token, str) + else mask_token + ) + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # HACK: These tokens were added by the author for an obscure reason as they were already part of the + # sentencepiece vocabulary (this is the case for and and ). + # In this case it is recommended to properly set the tokens by hand. + self._added_tokens_decoder = { + 0: AddedToken("NOTUSED", special=True), + 1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token, + 2: AddedToken("NOTUSED", special=True), + 3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token, + 4: AddedToken("NOTUSED", special=True), + } + + self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4 + + # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here + if "added_tokens_decoder" in kwargs: + # this is the only class that requires this unfortunately..... + # the reason is that the fast version has a whole. + kwargs["added_tokens_decoder"].update(self._added_tokens_decoder) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning. + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + # specific to camembert, both 3 and 4 point to the unk token. + if self.sp_model.PieceToId(token) == 0: + # Convert sentence piece unk token to fairseq unk token index + return self.unk_token_id + return self.fairseq_offset + self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # TODO decode outputs do not match between fast and slow + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["CamembertTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..423058ed959aeb3e7122fb3730a6d0f1f57b5982 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_camembert import CamembertTokenizer +else: + CamembertTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class CamembertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CamembertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["CamembertTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad11a90a24bc4e8c9fd744bca6297e5388fd52e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_chameleon import * + from .image_processing_chameleon import * + from .image_processing_chameleon_fast import * + from .modeling_chameleon import * + from .processing_chameleon import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..34436a5288c8187d893d7ca4775b812b4c4d7961 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""chameleon model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ChameleonVQVAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonVQModel`]. It is used to instantiate a + `ChameleonVQModel` according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a + configuration with the defaults will yield a similar configuration to the VQModel of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Args: + embed_dim (`int`, *optional*, defaults to 256): + Dimensionality of each embedding vector. + num_embeddings (`int`, *optional*, defaults to 8192): + Number of codebook embeddings. + double_latent (`bool`, *optional*, defaults to `False`): + Whether to use double z channels. + latent_channels (`int`, *optional*, defaults to 256): + Number of channels for the latent space. + resolution (`int`, *optional*, defaults to 512): + Resolution of the input images. + in_channels (`int`, *optional*, defaults to 3): + Number of input channels. + base_channels (`int`, *optional*, defaults to 128): + Base channel count. + channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`): + Channel multipliers for each resolution. + num_res_blocks (`int`, *optional*, defaults to 2): + Number of residual blocks. + attn_resolutions (`list[int]`, *optional*): + Resolutions to apply attention. + dropout (`float`, *optional*, defaults to 0.0): + Dropout rate. + attn_type (`str`, *optional*, defaults to `"vanilla"`): + Attention type used in VQ-GAN encoder. Can be "vanilla" or None. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + """ + + model_type = "chameleon_vqgan" + base_config_key = "vq_config" + + def __init__( + self, + embed_dim: int = 256, + num_embeddings: int = 8192, + double_latent: bool = False, + latent_channels: int = 256, + resolution: int = 512, + in_channels: int = 3, + base_channels: int = 128, + channel_multiplier: list[int] = [1, 1, 2, 2, 4], + num_res_blocks: int = 2, + attn_resolutions: Optional[list[int]] = None, + dropout: float = 0.0, + attn_type: str = "vanilla", + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.num_embeddings = num_embeddings + self.double_latent = double_latent + self.latent_channels = latent_channels + self.resolution = resolution + self.in_channels = in_channels + self.base_channels = base_channels + self.channel_multiplier = channel_multiplier + self.num_res_blocks = num_res_blocks + self.attn_resolutions = attn_resolutions + self.dropout = dropout + self.attn_type = attn_type + self.initializer_range = initializer_range + + +class ChameleonConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a + chameleon model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 65536): + Vocabulary size of the chameleon model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ChameleonModel`]; this includes text and image tokens. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Chameleon supports up to 4096 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/Localchameleon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + model_parallel_size (`int`, *optional*, defaults to 1): + Number of shards used when training the model. This will be used in qk layernorm because the original Chameleon inference + doesn't do reduction in those layers and each rank has its own biases. + swin_norm (`bool`, *optional*, defaults to `False`): + Use Swin Transformer normalization. + vq_config (`dict`, *optional*): + ChameleonVQConfig instance containing the configuration for the VQ-VAE model. + vocabulary_map (`dict`, *optional*): + A dictionary containing the vocabulary map from the tokenizer. Used to obtain tokens from the image inputs. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + + + ```python + >>> from transformers import ChameleonModel, ChameleonConfig + + >>> # Initializing a chameleon chameleon-7b style configuration + >>> configuration = ChameleonConfig() + + >>> # Initializing a model from the chameleon-7b style configuration + >>> model = ChameleonModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65536, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + model_parallel_size=1, + swin_norm=False, + vq_config=None, + vocabulary_map=None, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_bias = mlp_bias + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.model_parallel_size = model_parallel_size + self.swin_norm = swin_norm + + if vq_config is None: + vq_config = {} + logger.info("vq_config is None. initializing the ChameleonVQConfig with default values.") + + self.vq_config = ChameleonVQVAEConfig(**vq_config) + + self.vocabulary_map = vocabulary_map + self.image_token_id = vocabulary_map.get("") if vocabulary_map is not None else None + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..9cae9d7bdd34d8b7a2292059ae1e92b587891272 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Chameleon.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import get_resize_output_image_size, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +logger = logging.get_logger(__name__) + +if is_vision_available(): + import PIL + + +class ChameleonImageProcessor(BaseImageProcessor): + r""" + Constructs a Chameleon image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 512}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to 1): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to {"height": 512, "width": 512}): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to 0.0078): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PIL.Image.LANCZOS, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 0.0078, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 512} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 512, "width": 512} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [1.0, 1.0, 1.0] + self.image_std = image_std if image_std is not None else [1.0, 1.0, 1.0] + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [self.blend_rgba(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def blend_rgba(self, image: ImageInput) -> ImageInput: + """ + Convert image to RGB by blending the transparency layer if it's in RGBA format. + If image is not `PIL.Image`, it si simply returned without modifications. + + Args: + image (`ImageInput`): + Image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + elif image.mode == "RGB": + return image + + img_rgba = np.array(image.convert("RGBA")) + + # If there is no transparency layer, simple convert and return. + if not (img_rgba[:, :, 3] < 255).any(): + return image.convert("RGB") + + # There is a transparency layer, blend it with a white background. + # Calculate the alpha proportion for blending. + alpha = img_rgba[:, :, 3] / 255.0 + img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3] + return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB") + + +__all__ = ["ChameleonImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..1d102614f7df3700845c00f9d8bfa217930c776b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2025 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for Chameleon.""" + +from typing import Optional + +import numpy as np +import PIL +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import ImageInput, PILImageResampling, SizeDict +from ...utils import auto_docstring, logging + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class ChameleonImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.LANCZOS + image_mean = [1.0, 1.0, 1.0] + image_std = [1.0, 1.0, 1.0] + size = {"shortest_edge": 512} + default_to_square = False + crop_size = {"height": 512, "width": 512} + do_resize = True + do_center_crop = True + do_rescale = True + rescale_factor = 0.0078 + do_normalize = True + do_convert_rgb = True + + def convert_to_rgb(self, image: ImageInput) -> ImageInput: + """ + Convert image to RGB by blending the transparency layer if it's in RGBA format. + If image is not `PIL.Image`, it si simply returned without modifications. + + Args: + image (`ImageInput`): + Image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + elif image.mode == "RGB": + return image + + img_rgba = np.array(image.convert("RGBA")) + + # If there is no transparency layer, simple convert and return. + if not (img_rgba[:, :, 3] < 255).any(): + return image.convert("RGB") + + # There is a transparency layer, blend it with a white background. + # Calculate the alpha proportion for blending. + alpha = img_rgba[:, :, 3] / 255.0 + img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3] + return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB") + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if interpolation == F.InterpolationMode.LANCZOS: + logger.warning_once( + "You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. " + "BICUBIC resample will be used as an alternative. Please fall back to slow image processor if you " + "want full consistency with the original model." + ) + interpolation = F.InterpolationMode.BICUBIC + + return super().resize( + image=image, + size=size, + interpolation=interpolation, + **kwargs, + ) + + +__all__ = ["ChameleonImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..d481a62b6fc6608bd3088e4766b91ff843680ea0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Chameleon. +""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ( + MultiModalData, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, + Unpack, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class ChameleonTextKwargs(TextKwargs, total=False): + return_for_text_completion: bool + + +class ChameleonProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: ChameleonTextKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "return_for_text_completion": False, + "return_mm_token_type_ids": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + +class ChameleonProcessor(ProcessorMixin): + r""" + Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single + processor. + + [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. + See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. + + Args: + image_processor ([`ChameleonImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + image_seq_length (`int`, *optional*, defaults to 1024): + Sequence length of one image embedding. + image_token (`str`, *optional*, defaults to `""`): + The special token used to indicate image in the text. + """ + + attributes = ["image_processor", "tokenizer"] + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + image_processor_class = "ChameleonImageProcessor" + + def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + self.image_seq_length = image_seq_length + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.image_start_token = ( + tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "" + ) # fixed tokens for start and end, so can hardcode + self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "" + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.image_start_token_id = tokenizer.convert_tokens_to_ids(self.image_start_token) + self.image_end_token_id = tokenizer.convert_tokens_to_ids(self.image_end_token) + self.image_ids = [self.image_token_id, self.image_start_token_id, self.image_end_token_id] + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ChameleonProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise TypeError("Invalid input text. Please provide a string, or a list of strings") + if text is None and images is None: + raise ValueError("You must provide either text or images") + + output_kwargs = self._merge_kwargs( + ChameleonProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False) + + # Replace the image token with the expanded image token sequence + prompt_strings = [] + one_img_tokens = self.image_start_token + (self.image_token * self.image_seq_length) + self.image_end_token + for sample in text: + sample = sample.replace(self.image_token, one_img_tokens) + if not return_for_text_completion: + sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode + prompt_strings.append(sample) + + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + # add 2 for BOI and EOI tokens + num_image_tokens = [self.image_seq_length + 2] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + +__all__ = ["ChameleonProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d54ee86aecef2cbe5b9bfdee321a0375d977880 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_clap import * + from .feature_extraction_clap import * + from .modeling_clap import * + from .processing_clap import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..900e8d373f5ad15d3d90f2da655abe10f3279f85 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py @@ -0,0 +1,382 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CLAP model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ClapTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a CLAP + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLAP + [calp-hsat-fused](https://huggingface.co/laion/clap-hsat-fused) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the CLAP model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ClapTextModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"relu"`, + `"relu"`, `"silu"` and `"relu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`]. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + projection_dim (`int`, *optional*, defaults to 512) + Dimension of the projection head of the `ClapTextModelWithProjection`. + + Examples: + + ```python + >>> from transformers import ClapTextConfig, ClapTextModel + + >>> # Initializing a CLAP text configuration + >>> configuration = ClapTextConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = ClapTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clap_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + initializer_factor=1.0, + layer_norm_eps=1e-12, + projection_dim=512, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + projection_hidden_act="relu", + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.projection_hidden_act = projection_hidden_act + self.projection_dim = projection_dim + + +class ClapAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a + CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP + [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + window_size (`int`, *optional*, defaults to 8): + Image size of the spectrogram + num_mel_bins (`int`, *optional*, defaults to 64): + Number of mel features used per frames. Should correspond to the value used in the `ClapProcessor` class. + spec_size (`int`, *optional*, defaults to 256): + Desired input size of the spectrogram that the model supports. It can be different from the output of the + `ClapFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size` + of the audio models. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + patch_size (`int`, *optional*, defaults to 4): + Patch size for the audio spectrogram + patch_stride (`list`, *optional*, defaults to `[4, 4]`): + Patch stride for the audio spectrogram + num_classes (`int`, *optional*, defaults to 527): + Number of classes used for the head training + hidden_size (`int`, *optional*, defaults to 768): + Hidden size of the output of the audio encoder. Correspond to the dimension of the penultimate layer's + output,which is sent to the projection MLP layer. + projection_dim (`int`, *optional*, defaults to 512): + Hidden size of the projection layer. + depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`): + Depths used for the Swin Layers of the audio model + num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`): + Number of attention heads used for the Swin Layers of the audio model + enable_fusion (`bool`, *optional*, defaults to `False`): + Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the + best results. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the encoder. + fusion_type (`[type]`, *optional*): + Fusion type used for the patch fusion. + patch_embed_input_channels (`int`, *optional*, defaults to 1): + Number of channels used for the input spectrogram + flatten_patch_embeds (`bool`, *optional*, defaults to `True`): + Whether or not to flatten the patch embeddings + patch_embeds_hidden_size (`int`, *optional*, defaults to 96): + Hidden size of the patch embeddings. It is used as the number of output channels. + enable_patch_layer_norm (`bool`, *optional*, defaults to `True`): + Whether or not to enable layer normalization for the patch embeddings + drop_path_rate (`float`, *optional*, defaults to 0.0): + Drop path rate for the patch fusion + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not to add a bias to the query, key, value projections. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of the mlp hidden dim to embedding dim. + aff_block_r (`int`, *optional*, defaults to 4): + downsize_ratio used in the AudioFF block + num_hidden_layers (`int`, *optional*, defaults to 4): + Number of hidden layers in the Transformer encoder. + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + layer_norm_eps (`[type]`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import ClapAudioConfig, ClapAudioModel + + >>> # Initializing a ClapAudioConfig with laion/clap-htsat-fused style configuration + >>> configuration = ClapAudioConfig() + + >>> # Initializing a ClapAudioModel (with random weights) from the laion/clap-htsat-fused style configuration + >>> model = ClapAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clap_audio_model" + base_config_key = "audio_config" + + def __init__( + self, + window_size=8, + num_mel_bins=64, + spec_size=256, + hidden_act="gelu", + patch_size=4, + patch_stride=[4, 4], + num_classes=527, + hidden_size=768, + projection_dim=512, + depths=[2, 2, 6, 2], + num_attention_heads=[4, 8, 16, 32], + enable_fusion=False, + hidden_dropout_prob=0.1, + fusion_type=None, + patch_embed_input_channels=1, + flatten_patch_embeds=True, + patch_embeds_hidden_size=96, + enable_patch_layer_norm=True, + drop_path_rate=0.0, + attention_probs_dropout_prob=0.0, + qkv_bias=True, + mlp_ratio=4.0, + aff_block_r=4, + num_hidden_layers=4, + projection_hidden_act="relu", + layer_norm_eps=1e-5, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + self.window_size = window_size + self.num_mel_bins = num_mel_bins + self.spec_size = spec_size + self.patch_size = patch_size + self.patch_stride = patch_stride + self.num_classes = num_classes + self.hidden_size = hidden_size + self.depths = depths + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.window_size = window_size + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.projection_dim = projection_dim + self.flatten_patch_embeds = flatten_patch_embeds + self.patch_embeds_hidden_size = patch_embeds_hidden_size + self.enable_patch_layer_norm = enable_patch_layer_norm + self.drop_path_rate = drop_path_rate + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.qkv_bias = qkv_bias + self.mlp_ratio = mlp_ratio + self.patch_embed_input_channels = patch_embed_input_channels + self.aff_block_r = aff_block_r + self.layer_norm_eps = layer_norm_eps + self.initializer_factor = initializer_factor + self.projection_hidden_act = projection_hidden_act + + +class ClapConfig(PretrainedConfig): + r""" + [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate + a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLAP + [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ClapTextConfig`]. + audio_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ClapAudioConfig`]. + logit_scale_init_value (`float`, *optional*, defaults to 14.29): + The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and audio projection layers. + projection_hidden_act (`str`, *optional*, defaults to `"relu"`): + Activation function for the projection layers. + initializer_factor (`float`, *optional*, defaults to 1.0): + Factor to scale the initialization of the model weights. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import ClapConfig, ClapModel + + >>> # Initializing a ClapConfig with laion-ai/base style configuration + >>> configuration = ClapConfig() + + >>> # Initializing a ClapModel (with random weights) from the laion-ai/base style configuration + >>> model = ClapModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a ClapConfig from a ClapTextConfig and a ClapAudioConfig + >>> from transformers import ClapTextConfig, ClapAudioConfig + + >>> # Initializing a ClapText and ClapAudioConfig configuration + >>> config_text = ClapTextConfig() + >>> config_audio = ClapAudioConfig() + + >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio) + ```""" + + model_type = "clap" + sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} + + def __init__( + self, + text_config=None, + audio_config=None, + logit_scale_init_value=(1 / 0.07), + projection_dim=512, + projection_hidden_act="relu", + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the ClapTextConfig with default values.") + + if audio_config is None: + audio_config = {} + logger.info("audio_config is None. initializing the ClapAudioConfig with default values.") + + self.text_config = ClapTextConfig(**text_config) + self.audio_config = ClapAudioConfig(**audio_config) + self.text_config.projection_dim = projection_dim + self.audio_config.projection_dim = projection_dim + + self.text_config.projection_hidden_act = projection_hidden_act + self.audio_config.projection_hidden_act = projection_hidden_act + + self.projection_dim = projection_dim + self.projection_hidden_act = projection_hidden_act + self.hidden_size = self.text_config.hidden_size + + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = initializer_factor + self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) + + +__all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..33daac615c07ab879d8515e04e09d01fe27b37fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py @@ -0,0 +1,367 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for CLAP.""" + +import copy +from typing import Any, Optional, Union + +import numpy as np +import torch + +from ...audio_utils import mel_filter_bank, spectrogram, window_function +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import TensorType, logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + + +@requires(backends=("torch",)) +class ClapFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a CLAP feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time + Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent. + + Args: + feature_size (`int`, *optional*, defaults to 64): + The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters + (`n_mels`). + sampling_rate (`int`, *optional*, defaults to 48000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves + to warn users if the audio fed to the feature extractor does not have the same sampling rate. + hop_length (`int`,*optional*, defaults to 480): + Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split + in smaller `frames` with a step of `hop_length` between each frame. + max_length_s (`int`, *optional*, defaults to 10): + The maximum input length of the model in seconds. This is used to pad the audio. + fft_window_size (`int`, *optional*, defaults to 1024): + Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency + resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. + padding_value (`float`, *optional*, defaults to 0.0): + Padding value used to pad the audio. Should correspond to silences. + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not the model should return the attention masks corresponding to the input. + frequency_min (`float`, *optional*, defaults to 0): + The lowest frequency of interest. The STFT will not be computed for values below this. + frequency_max (`float`, *optional*, defaults to 14000): + The highest frequency of interest. The STFT will not be computed for values above this. + top_db (`float`, *optional*): + The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the + `audio_utils.power_to_db` function + truncation (`str`, *optional*, defaults to `"fusion"`): + Truncation pattern for long audio inputs. Two patterns are available: + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a + downsampled version of the entire mel spectrogram. + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy + of the original mel obtained from the padded audio. + - `rand_trunc` will select a random crop of the mel spectrogram. + padding (`str`, *optional*, defaults to `"repeatpad"`): + Padding pattern for shorter audio inputs. Three patterns were originally implemented: + - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. + - `repeat`: the audio is repeated and then cut to fit the `max_length` + - `pad`: the audio is padded. + """ + + model_input_names = ["input_features", "is_longer"] + + def __init__( + self, + feature_size=64, + sampling_rate=48_000, + hop_length=480, + max_length_s=10, + fft_window_size=1024, + padding_value=0.0, + return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask + frequency_min: float = 0, + frequency_max: float = 14_000, + top_db: Optional[int] = None, + truncation: str = "fusion", + padding: str = "repeatpad", + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + return_attention_mask=return_attention_mask, + **kwargs, + ) + self.top_db = top_db + self.truncation = truncation + self.padding = padding + self.fft_window_size = fft_window_size + self.nb_frequency_bins = (fft_window_size >> 1) + 1 + self.hop_length = hop_length + self.max_length_s = max_length_s + self.nb_max_samples = max_length_s * sampling_rate + self.sampling_rate = sampling_rate + self.frequency_min = frequency_min + self.frequency_max = frequency_max + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, + norm=None, + mel_scale="htk", + ) + self.mel_filters_slaney = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the + mel filter banks, which do not need to be saved or printed as they are too long. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + if "mel_filters_slaney" in output: + del output["mel_filters_slaney"] + return output + + def _np_extract_fbank_features(self, waveform: np.ndarray, mel_filters: Optional[np.ndarray] = None) -> np.ndarray: + """ + Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter + banks are used depending on the truncation pattern: + - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from + calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` + is set to `"fusion"`. + - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used + `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original + implementation when the truncation mode is not `"fusion"`. + """ + log_mel_spectrogram = spectrogram( + waveform, + window_function(self.fft_window_size, "hann"), + frame_length=self.fft_window_size, + hop_length=self.hop_length, + power=2.0, + mel_filters=mel_filters, + log_mel="dB", + ) + return log_mel_spectrogram.T + + def _random_mel_fusion(self, mel, total_frames, chunk_frames): + ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) + if len(ranges[1]) == 0: + # if the audio is too short, we just use the first chunk + ranges[1] = [0] + if len(ranges[2]) == 0: + # if the audio is too short, we just use the first chunk + ranges[2] = [0] + # randomly choose index for each part + idx_front = np.random.choice(ranges[0]) + idx_middle = np.random.choice(ranges[1]) + idx_back = np.random.choice(ranges[2]) + + mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] + mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] + mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] + + mel = torch.tensor(mel[None, None, :]) + mel_shrink = torch.nn.functional.interpolate( + mel, size=[chunk_frames, 64], mode="bilinear", align_corners=False + ) + mel_shrink = mel_shrink[0][0].numpy() + mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0) + return mel_fusion + + def _get_input_mel(self, waveform: np.ndarray, max_length, truncation, padding) -> np.ndarray: + """ + Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. + Four different path are possible: + - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram + will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram + are then stacked together. They will later be used for `feature_fusion`. + - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is + padded based on `padding`. + - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded + based on `padding`, and is repeated `4` times. + - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel + spectrogram will be computed on a random crop of the waveform. + + """ + if waveform.shape[0] > max_length: + if truncation == "rand_trunc": + longer = True + # random crop to max_length (for compatibility) -> this should be handled by self.pad + overflow = len(waveform) - max_length + idx = np.random.randint(0, overflow + 1) + waveform = waveform[idx : idx + max_length] + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] + elif truncation == "fusion": + mel = self._np_extract_fbank_features(waveform, self.mel_filters) + chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed + total_frames = mel.shape[0] + if chunk_frames == total_frames: + # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length. + # In this case, we just use the whole audio. + input_mel = np.stack([mel, mel, mel, mel], axis=0) + longer = False + else: + input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames) + longer = True + else: + raise NotImplementedError(f"data_truncating {truncation} not implemented") + + else: + longer = False + # only use repeat as a new possible value for padding. you repeat the audio before applying the usual max_length padding + if waveform.shape[0] < max_length: + if padding == "repeat": + n_repeat = int(max_length / len(waveform)) + waveform = np.tile(waveform, n_repeat + 1)[:max_length] + if padding == "repeatpad": + n_repeat = int(max_length / len(waveform)) + waveform = np.tile(waveform, n_repeat) + waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) + + if truncation == "fusion": + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters) + input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) + else: + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] + + return input_mel, longer + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + truncation: Optional[str] = None, + padding: Optional[str] = None, + max_length: Optional[int] = None, + sampling_rate: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + truncation (`str`, *optional*): + Truncation pattern for long audio inputs. Two patterns are available: + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and + a downsampled version of the entire mel spectrogram. + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a + copy of the original mel obtained from the padded audio. + - `rand_trunc` will select a random crop of the mel spectrogram. + padding (`str`, *optional*): + Padding pattern for shorter audio inputs. Three patterns were originally implemented: + - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. + - `repeat`: the audio is repeated and then cut to fit the `max_length` + - `pad`: the audio is padded. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.np.array` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition + pipeline. + """ + truncation = truncation if truncation is not None else self.truncation + padding = padding if padding else self.padding + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a" + f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input" + f" was sampled with {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float64) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float64) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float64) + + # always return batch + if not is_batched: + raw_speech = [np.asarray(raw_speech)] + + # convert to mel spectrogram, truncate and pad if needed. + padded_inputs = [ + self._get_input_mel(waveform, max_length if max_length else self.nb_max_samples, truncation, padding) + for waveform in raw_speech + ] + + input_mel = [] + is_longer = [] + for mel, longer in padded_inputs: + input_mel.append(mel) + is_longer.append(longer) + + if truncation == "fusion" and sum(is_longer) == 0: + # if no audio is longer than 10s, then randomly select one audio to be longer + rand_idx = np.random.randint(0, len(input_mel)) + is_longer[rand_idx] = True + + if isinstance(input_mel[0], list): + input_mel = [np.asarray(feature, dtype=np.float64) for feature in input_mel] + + # is_longer is a list of bool + is_longer = [[longer] for longer in is_longer] + + input_features = {"input_features": input_mel, "is_longer": is_longer} + input_features = BatchFeature(input_features) + + if return_tensors is not None: + input_features = input_features.convert_to_tensors(return_tensors) + + return input_features + + +__all__ = ["ClapFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..9d81a26581dd35dcfcc06b0f1881640acd47a070 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py @@ -0,0 +1,1929 @@ +# coding=utf-8 +# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CLAP model.""" + +import collections +import math +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int +from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig + + +logger = logging.get_logger(__name__) + + +# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 +def interpolate(hidden_states, ratio): + """ + Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. + + Args: + hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)): + Input hidden states + ratio (`int`): + The ratio of the length of the output to the length of the input. + """ + (batch_size, time_length, classes_num) = hidden_states.shape + upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1) + upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num) + return upsampled + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 +def window_partition(hidden_states, window_size): + """ + Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size, + num_channels)` + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`): + Input hidden states + window_size (`int`): + Window size + """ + batch_size, height, width, num_channels = hidden_states.shape + + hidden_states = hidden_states.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) + windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. + Args: + windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`): + Input windows + window_size (`int`): + Window size + height (`int`): + Height of the resized audio + width (`int`): + Width of the resized audio + """ + num_channels = windows.shape[-1] + windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels) + windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels) + return windows + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html#CLIP-loss-function +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + labels = torch.arange(len(logits), device=logits.device) + return nn.functional.cross_entropy(logits, labels) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for text model's outputs that also contains a pooling of the last hidden states. + """ +) +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap +class ClapTextModelOutput(ModelOutput): + r""" + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + ClapAudio model output to mimic the output of the original implementation. + """ +) +class ClapAudioModelOutput(ModelOutput): + r""" + audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + The Audio embeddings obtained by applying the projection layer to the pooler_output. + """ + + audio_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio +class ClapOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for audio-text similarity. + logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`): + The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`): + The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`]. + audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`ClapTextModel`]. + audio_model_output (`BaseModelOutputWithPooling`): + The output of the [`ClapAudioModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_audio: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + audio_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + audio_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Adapted from transformers.models.swin.modeling_swin.SwinDropPath +class ClapDropPath(nn.Module): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly + refactored version of the `SwinDropPath` implementation. + """ + + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states): + if self.drop_prob == 0.0 or not self.training: + return hidden_states + + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) + + random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device) + random_tensor.floor_() # binarize + output = hidden_states.div(keep_prob) * random_tensor + return output + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 +class ClapAudioAFFBlock(nn.Module): + r""" + ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement + the 1D version. + """ + + def __init__(self, config: ClapAudioConfig): + super().__init__() + channels = config.patch_embeds_hidden_size + downsize_ratio = config.aff_block_r + inter_channels = int(channels // downsize_ratio) + + self.local_att = nn.Sequential( + nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(inter_channels), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(channels), + ) + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(inter_channels), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(channels), + ) + + self.sigmoid = nn.Sigmoid() + + def forward(self, hidden_states, residual): + attention_input = hidden_states + residual + + fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input) + fused_layer_output = self.sigmoid(fused_layer_output) + + output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output) + return output + + +class ClapAudioPatchEmbed(nn.Module): + """ + This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the + Transformer block. + """ + + def __init__(self, config: ClapAudioConfig): + super().__init__() + img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size + patch_size = ( + (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size + ) + patch_stride = ( + (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride + ) + + self.img_size = img_size + self.patch_stride = patch_stride + + self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + + self.flatten = config.flatten_patch_embeds + self.enable_fusion = config.enable_fusion + + padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) + + scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1 + + self.proj = nn.Conv2d( + config.patch_embed_input_channels * scale_factor, + config.patch_embeds_hidden_size, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) + + self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() + if self.enable_fusion: + self.fusion_model = ClapAudioAFFBlock(config) + self.mel_conv2d = nn.Conv2d( + config.patch_embed_input_channels, + config.patch_embeds_hidden_size, + kernel_size=(patch_size[0], patch_size[1] * 3), + stride=(patch_stride[0], patch_stride[1] * 3), + padding=padding, + ) + + def forward(self, hidden_states, is_longer_idx=None): + if self.enable_fusion: + # retrieve the last mel as we have transposed the input + global_hidden_states = hidden_states[:, 0:1, :, :] + + # global processing + batch_size, num_channels, height, width = global_hidden_states.shape + + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + ) + + global_hidden_states = self.proj(global_hidden_states) + output_width = global_hidden_states.size(-1) + if len(is_longer_idx) > 0: + # local processing + local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous() + batch_size, num_channels, height, width = local_hidden_states.shape + local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width) + + local_hidden_states = self.mel_conv2d(local_hidden_states) + + _, features, height, width = local_hidden_states.shape + local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width) + local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3) + + local_width = local_hidden_states.size(-1) + local_hidden_states = torch.nn.functional.pad( + local_hidden_states, (0, output_width - local_width), "constant", 0 + ) + + global_hidden_states[is_longer_idx] = self.fusion_model( + global_hidden_states[is_longer_idx], local_hidden_states + ) + hidden_states = global_hidden_states + else: + _, _, height, width = hidden_states.shape + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + ) + hidden_states = self.proj(hidden_states) + + if self.flatten: + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.norm(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->ClapAudio +class ClapAudioSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + hidden_shape = (batch_size, dim, -1, self.attention_head_size) + + query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2) + key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ClapAudioModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio +class ClapAudioSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio +class ClapAudioAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size) + self.output = ClapAudioSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio +class ClapAudioIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->ClapAudio +class ClapAudioOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio +class ClapAudioLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = ClapAudioIntermediate(config, dim) + self.output = ClapAudioOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = torch_int(0) + self.window_size = ( + torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) + ) + + def get_attn_mask(self, height, width, dtype, device): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + if not always_partition: + self.set_shift_and_window_size(input_dimensions) + else: + pass + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + + hidden_states = hidden_states.view(batch_size, height, width, channels) + + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask( + height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device + ) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio +class ClapAudioStage(GradientCheckpointingLayer): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + ClapAudioLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + drop_path_rate=drop_path[i], + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->ClapAudio +class ClapAudioPatchMerging(nn.Module): + """ + Patch Merging Layer. + + Args: + input_resolution (`tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be divisible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +class ClapAudioEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_layers = len(config.depths) + + self.config = config + self.patch_embed = ClapAudioPatchEmbed(config) + self.enable_fusion = config.enable_fusion + self.patch_stride = self.patch_embed.patch_stride + self.spec_size = config.spec_size + self.freq_ratio = config.spec_size // config.num_mel_bins + + self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1)) + + drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")] + + grid_size = self.patch_embed.grid_size + self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] + + self.layers = nn.ModuleList( + [ + ClapAudioStage( + config=config, + dim=int(config.patch_embeds_hidden_size * 2**i_layer), + input_resolution=self.input_resolutions[i_layer], + depth=config.depths[i_layer], + num_heads=config.num_attention_heads[i_layer], + drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + self.batch_norm = nn.BatchNorm2d(config.num_mel_bins) + self.norm = nn.LayerNorm(self.num_features) + self.depths = config.depths + self.avgpool = nn.AdaptiveAvgPool1d(1) + + def reshape_mel2img(self, normalized_input_features): + """ + The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel + should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`]. + """ + _, _, time_length, freq_length = normalized_input_features.shape + + spec_width = int(self.spec_size * self.freq_ratio) + spec_height = self.spec_size // self.freq_ratio + + if time_length > spec_width or freq_length > spec_height: + raise ValueError("the wav size should be less than or equal to the swin input size") + + # to avoid bicubic zero error + if time_length < spec_width: + normalized_input_features = nn.functional.interpolate( + normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True + ) + if freq_length < spec_height: + normalized_input_features = nn.functional.interpolate( + normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True + ) + + batch, channels, time, freq = normalized_input_features.shape + + # batch_size, channels, spec_width, spec_height --> batch_size, channels, spec_height * freq_ratio, spec_width // freq_ratio + normalized_input_features = normalized_input_features.reshape( + batch, channels * self.freq_ratio, time // self.freq_ratio, freq + ) + normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous() + normalized_input_features = normalized_input_features.reshape( + batch, channels, freq * self.freq_ratio, time // self.freq_ratio + ) + + return normalized_input_features + + def forward( + self, + input_features, + is_longer: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + always_partition: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, ClapAudioModelOutput]: + input_features = input_features.transpose(1, 3) + normalized_input_features = self.batch_norm(input_features) + normalized_input_features = normalized_input_features.transpose(1, 3) + + is_longer_list_idx = None + if self.enable_fusion: + is_longer_list = is_longer.to(input_features.device) + is_longer_list_idx = torch.where(is_longer_list == 1)[0] + + hidden_states = self.reshape_mel2img(normalized_input_features) + + frames_num = hidden_states.shape[2] + + hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) + + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + input_dimensions = self.input_resolutions[0] + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + input_dimensions = self.input_resolutions[i] + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = layer_outputs[1] + output_dimensions = layer_outputs[2] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + + if output_hidden_states and output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states_before_downsampling.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + # here we use the original (not downsampled) height and width + reshaped_hidden_state = hidden_states_before_downsampling.view( + batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size + ) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[3:] + + last_hidden_state = self.norm(hidden_states) + + batch_size, _, n_channels = last_hidden_state.shape + + freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + + last_hidden_state = ( + last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape) + ) + + batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape + # group 2D CNN + c_freq_bin = n_frequencies // self.freq_ratio + last_hidden_state = last_hidden_state.reshape( + batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp + ) + last_hidden_state = ( + last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1) + ) + latent_output = self.avgpool(torch.flatten(last_hidden_state, 2)) + latent_output = torch.flatten(latent_output, 1) + + if not return_dict: + return tuple( + v + for v in [ + last_hidden_state, + latent_output, + all_reshaped_hidden_states, + all_self_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=latent_output, + hidden_states=all_reshaped_hidden_states, + attentions=all_self_attentions, + ) + + +class ClapProjectionLayer(nn.Module): + def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]): + super().__init__() + self.config = config + hidden_size = config.hidden_size + projection_dim = config.projection_dim + + self.linear1 = nn.Linear(hidden_size, projection_dim) + self.activation = ACT2FN[config.projection_hidden_act] + self.linear2 = nn.Linear(projection_dim, projection_dim) + + def forward(self, hidden_states): + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True +class ClapTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.align.modeling_align.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +# Copied from transformers.models.align.modeling_align.AlignTextSelfAttention with Align->Clap +class ClapTextSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.attention_dropout = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.attention_head_size) + + query_states = self.query(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class ClapTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.align.modeling_align.AlignTextAttention with Align->Clap +class ClapTextAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = ClapTextSelfAttention(config) + self.output = ClapTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + **kwargs, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class ClapTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class ClapTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.align.modeling_align.AlignTextLayer with Align->Clap +class ClapTextLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ClapTextAttention(config) + self.intermediate = ClapTextIntermediate(config) + self.output = ClapTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + **kwargs, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.align.modeling_align.AlignTextEncoder with Align->Clap +class ClapTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ClapTextLayer(config) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @can_return_tuple + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + **kwargs, + ) -> Union[tuple[torch.Tensor], BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=layer_head_mask, + output_attentions=output_attentions, + **kwargs, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class ClapTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class ClapPreTrainedModel(PreTrainedModel): + config: ClapConfig + base_model_prefix = "clap" + supports_gradient_checkpointing = False + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + factor = self.config.initializer_factor + + if isinstance(module, ClapTextEmbeddings): + module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, ClapModel): + module.logit_scale_a.data.fill_(math.log(self.config.logit_scale_init_value)) + module.logit_scale_t.data.fill_(math.log(self.config.logit_scale_init_value)) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.Linear)): + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + nn.init.normal_(module.weight, std=in_proj_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, ClapAudioSelfAttention): + module.relative_position_bias_table.data.zero_() + + +class ClapAudioModel(ClapPreTrainedModel): + config: ClapAudioConfig + main_input_name = "input_features" + + def __init__(self, config: ClapAudioConfig): + super().__init__(config) + self.audio_encoder = ClapAudioEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.audio_encoder.patch_embed.proj + + @auto_docstring + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): + Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance + the features. + + Examples: + + ```python + >>> from datasets import load_dataset + >>> from transformers import AutoProcessor, ClapAudioModel + + >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + + >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused") + >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused") + + >>> inputs = processor(audios=audio_sample, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + return self.audio_encoder( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@auto_docstring( + custom_intro=""" + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762 + """ +) +class ClapTextModel(ClapPreTrainedModel): + config: ClapTextConfig + + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = ClapTextEmbeddings(config) + self.encoder = ClapTextEncoder(config) + + self.pooler = ClapTextPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring +class ClapModel(ClapPreTrainedModel): + config: ClapConfig + + def __init__(self, config: ClapConfig): + super().__init__(config) + + if not isinstance(config.text_config, ClapTextConfig): + raise TypeError( + "config.text_config is expected to be of type ClapTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.audio_config, ClapAudioConfig): + raise TypeError( + "config.audio_config is expected to be of type ClapAudioConfig but is of type" + f" {type(config.audio_config)}." + ) + + text_config = config.text_config + audio_config = config.audio_config + + self.logit_scale_a = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value))) + self.logit_scale_t = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value))) + + self.projection_dim = config.projection_dim + + self.text_model = ClapTextModel(text_config) + self.text_projection = ClapProjectionLayer(text_config) + + self.audio_model = ClapAudioModel(audio_config) + self.audio_projection = ClapProjectionLayer(audio_config) + + # Initialize weights and apply final processing + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`ClapTextModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, ClapModel + + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + ```""" + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids + ) + text_features = self.text_projection(text_outputs.pooler_output) + text_features = F.normalize(text_features, dim=-1) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_audio_features( + self, + input_features: torch.Tensor, + is_longer: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): + Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance + the features. + + Returns: + audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by + applying the projection layer to the pooled output of [`ClapAudioModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, ClapModel + + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused") + >>> random_audio = torch.rand((16_000)) + + >>> inputs = feature_extractor(random_audio, return_tensors="pt") + >>> with torch.inference_mode(): + ... audio_features = model.get_audio_features(**inputs) + ```""" + audio_outputs: BaseModelOutputWithPooling = self.audio_model( + input_features=input_features, is_longer=is_longer + ) + audio_features = self.audio_projection(audio_outputs.pooler_output) + audio_features = F.normalize(audio_features, dim=-1) + + return audio_features + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.BoolTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ClapOutput]: + r""" + is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): + Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance + the features. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> from datasets import load_dataset + >>> from transformers import AutoProcessor, ClapModel + + >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused") + + >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"] + + >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True) + + >>> outputs = model(**inputs) + >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score + >>> probs = logits_per_audio.softmax(dim=-1) # we can take the softmax to get the label probabilities + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + audio_outputs = self.audio_model( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + audio_embeds = self.audio_projection(audio_embeds) + + text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale_text = self.logit_scale_t.exp() + logit_scale_audio = self.logit_scale_a.exp() + logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text + logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio + + loss = None + if return_loss: + caption_loss = contrastive_loss(logits_per_text) + audio_loss = contrastive_loss(logits_per_audio.t()) + loss = (caption_loss + audio_loss) / 2.0 + + return ClapOutput( + loss=loss, + logits_per_audio=logits_per_audio, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + audio_embeds=audio_embeds, + text_model_output=text_outputs, + audio_model_output=audio_outputs, + ) + + +@auto_docstring +class ClapTextModelWithProjection(ClapPreTrainedModel): + config: ClapTextConfig + + def __init__(self, config: ClapTextConfig): + super().__init__(config) + self.text_model = ClapTextModel(config) + self.text_projection = ClapProjectionLayer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.text_model.embeddings.word_embeddings = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ClapTextModelOutput]: + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, ClapTextModelWithProjection + + >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output + + text_embeds = self.text_projection(pooled_output) + + return ClapTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@auto_docstring +class ClapAudioModelWithProjection(ClapPreTrainedModel): + config: ClapAudioConfig + main_input_name = "input_features" + + def __init__(self, config: ClapAudioConfig): + super().__init__(config) + self.audio_model = ClapAudioModel(config) + self.audio_projection = ClapProjectionLayer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.audio_model.audio_encoder.patch_embed.proj + + @can_return_tuple + @auto_docstring + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + is_longer: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ClapAudioModelOutput]: + r""" + is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): + Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance + the features. + + Examples: + + ```python + >>> from datasets import load_dataset + >>> from transformers import ClapAudioModelWithProjection, ClapProcessor + + >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused") + >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused") + + >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + + >>> inputs = processor(audios=audio_sample, return_tensors="pt") + >>> outputs = model(**inputs) + >>> audio_embeds = outputs.audio_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + audio_outputs = self.audio_model( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + + audio_embeds = self.audio_projection(pooled_output) + + return ClapAudioModelOutput( + audio_embeds=audio_embeds, + last_hidden_state=audio_outputs.last_hidden_state, + attentions=audio_outputs.attentions, + hidden_states=audio_outputs.hidden_states, + ) + + +__all__ = [ + "ClapModel", + "ClapPreTrainedModel", + "ClapTextModel", + "ClapTextModelWithProjection", + "ClapAudioModel", + "ClapAudioModelWithProjection", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..6524a87158418206b8b96a7b57f6c1b7392e56cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Audio/Text processor class for CLAP +""" + +from typing import Optional, Union + +from ...audio_utils import AudioInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg + + +logger = logging.get_logger(__name__) + + +class ClapProcessor(ProcessorMixin): + r""" + Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBerta tokenizer into a single processor. + + [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the + [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. + + Args: + feature_extractor ([`ClapFeatureExtractor`]): + The audio processor is a required input. + tokenizer ([`RobertaTokenizerFast`]): + The tokenizer is a required input. + """ + + feature_extractor_class = "ClapFeatureExtractor" + tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + + @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audios: Optional[AudioInput] = None, + audio: Optional[AudioInput] = None, + **kwargs: Unpack[ProcessingKwargs], + ): + """ + Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text` + argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more + information. + """ + # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check + # again that the correct naming is used + if audios is not None and audio is None: + logger.warning( + "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`." + ) + audio = audios + + return super().__call__(text=text, audio=audio, **kwargs) + + +__all__ = ["ClapProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2d57500c44322b6c749a313f07c07e11f8f20f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cohere import * + from .modeling_cohere import * + from .tokenization_cohere_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..c78d1e9bf8a11c45246a5cb391769566dc69ce50 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Cohere model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CohereConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere + model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. + + + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CohereModel`] + hidden_size (`int`, *optional*, defaults to 8192): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22528): + Dimension of the MLP representations. + logit_scale (`float`, *optional*, defaults to 0.0625): + The scaling factor for the output logits. + num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 5): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 255001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + use_qk_norm (`bool`, *optional*, defaults to `False`): + Whether to use query-key normalization in the attention + + ```python + >>> from transformers import CohereModel, CohereConfig + + >>> # Initializing a Cohere model configuration + >>> configuration = CohereConfig() + + >>> # Initializing a model from the Cohere configuration + >>> model = CohereModel(configuration) # doctest: +SKIP + + >>> # Accessing the model configuration + >>> configuration = model.config # doctest: +SKIP + ```""" + + model_type = "cohere" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=256000, + hidden_size=8192, + intermediate_size=22528, + logit_scale=0.0625, + num_hidden_layers=40, + num_attention_heads=64, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=5, + eos_token_id=255001, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + use_qk_norm=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.logit_scale = logit_scale + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.use_qk_norm = use_qk_norm + + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["CohereConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..1dfa0ce0be33a6f3383e76d81de4f3895ab70bea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py @@ -0,0 +1,534 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere/modular_cohere.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on the LLama model definition file in transformers + + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_cohere import CohereConfig + + +class CohereLayerNorm(nn.Module): + def __init__(self, hidden_size=None, eps=1e-5, bias=False): + """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim""" + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = self.weight.to(torch.float32) * hidden_states + return hidden_states.to(input_dtype) + + +class CohereRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: CohereConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class CohereMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. + x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + dtype = q.dtype + q = q.float() + k = k.float() + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) + + +class CohereAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.use_qk_norm = config.use_qk_norm + if self.use_qk_norm: + # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads + self.q_norm = CohereLayerNorm( + hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps + ) + self.k_norm = CohereLayerNorm( + hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape) + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) + + if self.use_qk_norm: # main diff from Llama + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class CohereDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: CohereConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = CohereAttention(config=config, layer_idx=layer_idx) + self.mlp = CohereMLP(config) + self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states_attention, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states_mlp = self.mlp(hidden_states) + hidden_states = residual + hidden_states_attention + hidden_states_mlp + return hidden_states + + +@auto_docstring +class CoherePreTrainedModel(PreTrainedModel): + config: CohereConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["CohereDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": CohereDecoderLayer, + "attentions": CohereAttention, + } + + +@auto_docstring +class CohereModel(CoherePreTrainedModel): + def __init__(self, config: CohereConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + self.rotary_emb = CohereRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = CohereModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.logit_scale = config.logit_scale + self.tie_word_embeddings = config.tie_word_embeddings + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >> from transformers import AutoTokenizer, CohereForCausalLM + + >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + + >> prompt = "Hey, are you conscious? Can you talk to me?" + >> inputs = tokenizer(prompt, return_tensors="pt") + + >> # Generate + >> generate_ids = model.generate(inputs.input_ids, max_length=30) + >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = logits * self.logit_scale # main diff from Llama + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..daa12a15ed268967df7a3852424ba98944a646b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py @@ -0,0 +1,355 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on the LLama model definition file in transformers + +"""PyTorch Cohere model.""" + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging +from ...utils.deprecation import deprecate_kwarg +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaRotaryEmbedding, + eager_attention_forward, +) +from .configuration_cohere import CohereConfig + + +logger = logging.get_logger(__name__) + + +class CohereLayerNorm(nn.Module): + def __init__(self, hidden_size=None, eps=1e-5, bias=False): + """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim""" + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = self.weight.to(torch.float32) * hidden_states + return hidden_states.to(input_dtype) + + +class CohereRotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. + x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + dtype = q.dtype + q = q.float() + k = k.float() + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) + + +class CohereMLP(LlamaMLP): + def __init__(self, config): + super().__init__(config) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + +class CohereAttention(LlamaAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.use_qk_norm = config.use_qk_norm + if self.use_qk_norm: + # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads + self.q_norm = CohereLayerNorm( + hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps + ) + self.k_norm = CohereLayerNorm( + hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape) + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) + + if self.use_qk_norm: # main diff from Llama + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class CohereDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: CohereConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = CohereAttention(config=config, layer_idx=layer_idx) + self.mlp = CohereMLP(config) + self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states_attention, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states_mlp = self.mlp(hidden_states) + hidden_states = residual + hidden_states_attention + hidden_states_mlp + return hidden_states + + +class CohereModel(LlamaModel): + def __init__(self, config: CohereConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = CohereRotaryEmbedding(config=config) + self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + +class CohereForCausalLM(LlamaForCausalLM): + def __init__(self, config): + super().__init__(config) + self.model = CohereModel(config) + self.logit_scale = config.logit_scale + self.tie_word_embeddings = config.tie_word_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >> from transformers import AutoTokenizer, CohereForCausalLM + + >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + + >> prompt = "Hey, are you conscious? Can you talk to me?" + >> inputs = tokenizer(prompt, return_tensors="pt") + + >> # Generate + >> generate_ids = model.generate(inputs.input_ids, max_length=30) + >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = logits * self.logit_scale # main diff from Llama + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "CohereForCausalLM", + "CohereModel", + "CoherePreTrainedModel", # noqa: F822 +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..fd240b97848033ab77e984f85d91aa63e190df0c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py @@ -0,0 +1,512 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on the tokenization_llama_fast.py file in transformers + +import pickle +from typing import Literal, Union + +from tokenizers import processors + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "tokenizer_file": { + "Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json", + }, +} + +# fmt: off +DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere." +DEFAULT_RAG_PREAMBLE = """## Task and Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.""" +# fmt: on + + +class CohereTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding. + + This uses notably ByteFallback and NFC normalization. + + ```python + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + >>> tokenizer.encode("Hello this is a test") + [5, 28339, 2075, 1801, 1671, 3282] + ``` + + If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or + call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the + values of the first token and final token of an encoded sequence will not be correct). For more details, checkout + [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|END_OF_TURN_TOKEN|>"`): + The end of sequence token. + add_bos_token (`bool`, *optional*, defaults to `True`): + Whether or not to add an `bos_token` at the start of sequences. + add_eos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an `eos_token` at the end of sequences. + use_default_system_prompt (`bool`, *optional*, defaults to `False`): + Whether or not the default system prompt for Cohere tokenizer should be used. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not the tokenizer should automatically add a prefix space + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + padding_side = "left" + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None + # No `max_model_input_sizes` + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + clean_up_tokenization_spaces=False, + unk_token="", + bos_token="", + eos_token="<|END_OF_TURN_TOKEN|>", + add_bos_token=True, + add_eos_token=False, + use_default_system_prompt=False, + add_prefix_space=False, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + use_default_system_prompt=use_default_system_prompt, + add_prefix_space=add_prefix_space, + **kwargs, + ) + self._add_bos_token = add_bos_token + self._add_eos_token = add_eos_token + self.update_post_processor() + self.use_default_system_prompt = use_default_system_prompt + self.vocab_file = vocab_file + self.grounded_generation_template = kwargs.pop("grounded_generation_template", None) + self.tool_use_template = kwargs.pop("tool_use_template", None) + + # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly + # check this as they were green before. + pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def update_post_processor(self): + """ + Updates the underlying post processor with the current `bos_token` and `eos_token`. + """ + bos = self.bos_token + bos_token_id = self.bos_token_id + if bos is None and self.add_bos_token: + raise ValueError("add_bos_token = True but bos_token = None") + + eos = self.eos_token + eos_token_id = self.eos_token_id + if eos is None and self.add_eos_token: + raise ValueError("add_eos_token = True but eos_token = None") + + single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}" + pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}" + + special_tokens = [] + if self.add_bos_token: + special_tokens.append((bos, bos_token_id)) + if self.add_eos_token: + special_tokens.append((eos, eos_token_id)) + self._tokenizer.post_processor = processors.TemplateProcessing( + single=single, pair=pair, special_tokens=special_tokens + ) + + @property + def add_eos_token(self): + return self._add_eos_token + + @property + def add_bos_token(self): + return self._add_bos_token + + @add_eos_token.setter + def add_eos_token(self, value): + self._add_eos_token = value + self.update_post_processor() + + @add_bos_token.setter + def add_bos_token(self, value): + self._add_bos_token = value + self.update_post_processor() + + def apply_tool_use_template( + self, + conversation: list[dict[str, str]], + tools: list[dict], + **kwargs, + ) -> Union[str, list[int]]: + """Create a Command-R tool-use prompt. + + Once rendered, the prompt instructs the model to generate a list of actions to perform on a set of user supplied tools + to help carry out the user's requests. + + Conceptually, this works in the same way as `apply_chat_format`, but takes an additional `tools` parameter. + + Converts a chat in the form of a list of dictionaries with `"role"` and `"content"` keys and a list of available + tools for the model to use into a prompt string, or a list of token ids. + This method will use the tokenizer's `default_tool_use_template` template specified at the class level. + You can override the default template using the `tool_use_template` kwarg but the quality of your results may decrease. + + Args: + conversation (list[dict[str, str]]): A list of dicts + with "role" and "content" keys, representing the chat history so far. + tools (list[Dict]): a list of tools to render into the prompt for the model to choose from. + See an example at the bottom of the docstring. + The format should be: + * name (str): The name of the tool to be called. Valid names contain only the characters a-z, + A-Z, 0-9, _ and must not begin with a digit. + * description (str): The description of what the tool does, the model uses the description to + choose when and how to call the function. + * parameter_definitions (list[Dict]): The input parameters of the tool. Accepts a dictionary + where the key is the name of the parameter and the value is the parameter spec. + Valid parameter names contain only the characters a-z, A-Z, 0-9, _ and must not begin with a digit. + Parameter specs are as follows: + * description (str): The description of the parameter. + * type (str): the type of the parameter - most effective for python builtin data types, such as 'str', 'bool' + * required: boolean: Denotes whether the parameter is always present (required) or not. Defaults to not required. + add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate + the start of an assistant message. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, defaults to `False`): + Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`. + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + return_dict (`bool`, *optional*, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + **tokenizer_kwargs: Additional kwargs to pass to the tokenizer. + + Returns: + `str`: A rendered prompt string. + or if tokenize=True: + `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This + output is ready to pass to the model, either directly or via methods like `generate()`. + + Examples: + + ```python + >> tokenizer = CohereTokenizerFast.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tools = [ + { + "name": "internet_search", + "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet", + "parameter_definitions": { + "query": { + "description": "Query to search the internet with", + "type": "str", + "required": True + } + } + }, + { + "name': "directly_answer", + "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history", + "parameter_definitions": {} + } + ] + >> conversation = [ + {"role": "user", "content": "Whats the biggest penguin in the world?"} + ] + >> # render the prompt, ready for user to inspect, or for input into the model: + >> prompt = tokenizer.apply_tool_use_template(conversation, tools=tools, tokenize=False, add_generation_prompt=True) + >> print(prompt) + <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble + The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + + # System Preamble + ## Basic Rules + You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + + # User Preamble + ## Task and Context + You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + + ## Style Guide + Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. + + ## Available Tools + Here is a list of tools that you have available to you: + + \\`\\`\\`python + def internet_search(query: str) -> list[Dict]: + \"\"\"Returns a list of relevant document snippets for a textual query retrieved from the internet + + Args: + query (str): Query to search the internet with + \"\"\" + pass + \\`\\`\\` + + \\`\\`\\`python + def directly_answer() -> list[Dict]: + \"\"\"Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history + \"\"\" + pass + \\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: + \\`\\`\\`json + [ + { + "tool_name": title of the tool in the specification, + "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters + } + ]\\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + ``` + >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt') + >> outputs = model.generate(inputs, max_new_tokens=128) + >> print(tokenizer.decode(outputs[0])) + Action: ```json + [ + { + "tool_name": "internet_search", + "parameters": { + "query": "biggest penguin in the world" + } + } + ] + ``` + """ + return self.apply_chat_template( + conversation, + chat_template="tool_use", + tools=tools, + **kwargs, + ) + + def apply_grounded_generation_template( + self, + conversation: list[dict[str, str]], + documents: list[dict], + citation_mode: Literal["fast", "accurate"] = "accurate", + **kwargs, + ) -> Union[str, list[int]]: + """Create a Command-R grounded generation (aka RAG) prompt. + + Once rendered, the prompt instructs the model to generate a response with citations in, based on supplied documents. + + Conceptually, this works in the same way as `apply_chat_format`, but takes additional `documents` + and parameter `citation_mode` parameters. + + Converts a list of dictionaries with `"role"` and `"content"` keys and a list of + documents for the model to ground its response on into a prompt string, or a list of token ids. + This method will use the tokenizer's `grounded_generation_template` template specified at the class level. + You can override the default template using the `grounded_generation_template` kwarg but the quality of your results may decrease. + + Args: + conversation (list[dict[str, str]]): A list of dicts + with "role" and "content" keys, representing the chat history so far. + documents (list[dict[str, str]): A list of dicts, representing documents or tool outputs to ground your + generation on. A document is a semistructured dict, with a string to string mapping. Common fields are + `url`, `title`, `snippet` etc but should be descriptive of the key. They will get rendered into the prompt. + citation_mode: either "accurate" (prompt the model to generate an answer first, then rewrite it with citation + spans in) or "fast", where the prompt instructs the model to generate an answer with citations in directly. + The former has higher quality citations, the latter requires fewer tokens to be generated. + add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate + the start of an assistant message. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, defaults to `False`): + Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`. + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + return_dict (`bool`, *optional*, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + **tokenizer_kwargs: Additional kwargs to pass to the tokenizer. + + Returns: + `str`: A rendered prompt string. + or if tokenize=True: + `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This + output is ready to pass to the model, either directly or via methods like `generate()`. + + Examples: + + ```python + >> tokenizer = CohereTokenizerFast.from_pretrained('CohereForAI/c4ai-command-r-v01') + + >> # define documents: + >> documents = [ + { "title": "Tall penguins", "text": "Emperor penguins are the tallest." }, + { "title": "Penguin habitats", "text": "Emperor penguins only live in Antarctica."} + ] + >> # define a conversation: + >> conversation = [ + {"role": "user", "content": "Whats the biggest penguin in the world?"} + ] + >> # render the prompt, ready for user to inspect, or for input into the model: + >> grounded_generation_prompt = tokenizer.apply_grounded_generation_template(conversation, documents=documents, tokenize=False, add_generation_prompt=True) + >> print(grounded_generation_prompt) + <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble + The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + + ## Basic Rules + You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + + # User Preamble + ## Task and Context + You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + + ## Style Guide + Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> + Document: 0 + title: Tall penguins + text: Emperor penguins are the tallest. + + Document: 1 + title: Penguin habitats + text: Emperor penguins only live in Antarctica. + <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line. + Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'. + Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'. + Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup. + Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>''' + ``` + >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt') + >> outputs = model.generate(inputs, max_new_tokens=128) + >> print(tokenizer.decode(outputs[0])) + Relevant Documents: 0,1 + Cited Documents: 0,1 + Answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres. + Grounded answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres. + """ + return self.apply_chat_template( + conversation, + chat_template="rag", + documents=documents, + citation_mode=citation_mode, + **kwargs, + ) + + # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + +__all__ = ["CohereTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b20eb3c1e0a8b42669761c9c45ca292b490df27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cohere2_vision import * + from .image_processing_cohere2_vision_fast import * + from .modeling_cohere2_vision import * + from .processing_cohere2_vision import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..acc40fcf85711961a12f9536e6f4c0c97e94f59e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py @@ -0,0 +1,82 @@ +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class Cohere2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Cohere2VisionForConditionalGeneration`]. It is used to instantiate an + Cohere2 Vision model according to the specified arguments, defining the model architecture. + + [CohereLabs/command-a-vision-07-2025](https://huggingface.co/CohereLabs/command-a-vision-07-2025) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`): + The config object or dictionary of the text backbone. + downsample_factor (`int`, *optional*, defaults to 2): + The factor by which to downsample the input image. + image_token_id (`int`, *optional*, defaults to 255036): + The token ID to use as placeholder for the image input. + alignment_intermediate_size (`int`, *optional*, defaults to 36864): + The size of the intermediate layer for alignment. + """ + + model_type = "cohere2_vision" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + downsample_factor=2, + image_token_id=255036, + alignment_intermediate_size=36864, + **kwargs, + ): + super().__init__(**kwargs) + self.downsample_factor = downsample_factor + self.image_token_id = image_token_id + self.alignment_intermediate_size = alignment_intermediate_size + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=3072, + image_size=512, + num_hidden_layers=27, + num_attention_heads=12, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "cohere2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True) + + self.text_config = text_config + + +__all__ = ["Cohere2VisionConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..322e98dbd0f592fd152a2f87c3c5b9849d4dee29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -0,0 +1,306 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere2_vision.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import lru_cache +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. + """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +@lru_cache(maxsize=10) +def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: + """ + Computes all allowed aspect ratios for a given maximum number of input tiles. + + This function calculates all possible arrangements of tiles that can be formed + within the constraint of the maximum number of tiles. Each arrangement is + represented by its aspect ratio (width/height) and the corresponding tile configuration. + + Args: + max_image_tiles (`int`): + The maximum number of tiles allowed. + + Returns: + `list[tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height) + configuration in terms of number of tiles. + + Example: + >>> get_all_supported_aspect_ratios(4) + [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)] + + """ + aspect_ratios = [] + for width in range(1, max_image_tiles + 1): + for height in range(1, max_image_tiles + 1): + if width * height <= max_image_tiles: + aspect_ratios.append((width, height)) + return aspect_ratios + + +def get_optimal_tiled_canvas( + original_image_size: tuple[int, int], + target_tile_size: tuple[int, int], + min_image_tiles: int, + max_image_tiles: int, +) -> tuple[int, int]: + possible_resolutions = get_all_supported_aspect_ratios(max_image_tiles) + possible_resolutions = sorted(possible_resolutions, key=lambda x: x[0] * x[1]) + image_height, image_width = original_image_size + patch_size_height, patch_size_width = target_tile_size # (height == width) + + candidate_resolutions = np.array(possible_resolutions) * patch_size_height + original_size = np.stack([image_height, image_width]) + required_scales = candidate_resolutions / original_size + required_scale = np.min(required_scales, axis=-1, keepdims=True) # [n_resolutions, 1] + if np.all(required_scale < 1): + # We are forced to downscale, so try to minimize the amount of downscaling + best_grid = possible_resolutions[np.argmax(required_scale)] + else: + # Pick the resolution that required the least upscaling so that it most closely fits the image + required_scale = np.where(required_scale < 1.0, 10e9, required_scale) + best_grid = possible_resolutions[np.argmin(required_scale)] + return best_grid + + +@auto_docstring +class Cohere2VisionImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 512, "width": 512} + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + crop_to_patches = True + min_patches = 1 + max_patches = 12 + valid_kwargs = Cohere2VisionFastImageProcessorKwargs + patch_size = 16 + + def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def crop_image_to_patches( + self, + images: "torch.Tensor", + min_patches: int, + max_patches: int, + use_thumbnail: bool = True, + patch_size: Optional[Union[tuple, int, dict]] = None, + interpolation: Optional["F.InterpolationMode"] = None, + ): + """ + Crop the images to patches and return a list of cropped images. + The number of patches and their grid arrangement are determined by the original image size, + the target patch size and the minimum and maximum number of patches. + The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio. + + Args: + images (`torch.Tensor`): + The images to be cropped. + min_patches (`int`): + The minimum number of patches to be extracted from the image. + max_patches (`int`): + The maximum number of patches to be extracted from the image. + use_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to add a thumbnail image to the list of cropped patches. + patch_size (`int`, `tuple[int, int]`, `dict`, *optional*): + The size of the output patches. + The format of the image data. If `None`, the format is inferred from the input image. + + Returns: + list[`PIL.Image.Image`] or list[np.ndarray]: The list of cropped images. + """ + patch_size_height, patch_size_width = patch_size.height, patch_size.width + original_height, original_width = images.shape[-2:] + # find the closest aspect ratio to the target + num_columns, num_rows = get_optimal_tiled_canvas( + (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches + ) + + # calculate the target width and height + target_width = patch_size_width * num_columns + target_height = patch_size_height * num_rows + num_blocks = num_columns * num_rows + + # resize the image so that each patch is of patch_size + resized_image = self.resize( + images, SizeDict(height=target_height, width=target_width), interpolation=interpolation + ) + # split the image into patches + processed_images = [] + for i in range(num_blocks): + column = i % num_columns + row = i // num_columns + box = ( + column * patch_size_width, + row * patch_size_height, + (column + 1) * patch_size_width, + (row + 1) * patch_size_height, + ) + # split the image + patch_image = resized_image[..., box[1] : box[3], box[0] : box[2]] + processed_images.append(patch_image) + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = self.resize(images, patch_size, interpolation=interpolation) + processed_images.append(thumbnail_img) + + processed_images = torch.stack(processed_images, dim=0).transpose(0, 1).contiguous() + + return processed_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + crop_to_patches: bool, + min_patches: int, + max_patches: int, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + if crop_to_patches: + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + num_patches = {} + for shape, stacked_images in grouped_images.items(): + stacked_images = self.crop_image_to_patches( + stacked_images, + min_patches, + max_patches, + patch_size=size, + interpolation=interpolation, + ) + processed_images_grouped[shape] = stacked_images + num_patches[shape] = [stacked_images.shape[1]] * stacked_images.shape[0] + images = reorder_images(processed_images_grouped, grouped_images_index) + images = [image for images_list in images for image in images_list] + num_patches = reorder_images(num_patches, grouped_images_index) + else: + num_patches = [1] * len(images) + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature( + data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors + ) + + def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): + """ + A utility that returns number patches for a given image size. + + Args: + height (`int`): + Height of the input image. + width (`int`): + Width of the input image. + images_kwargs (`dict`, *optional*) + Any kwargs to override defaults of the image processor. + Returns: + `int`: Number of patches per image. + """ + min_patches = images_kwargs.get("min_patches", self.min_patches) + max_patches = images_kwargs.get("max_patches", self.max_patches) + patch_size = images_kwargs.get("patch_size", self.size) + crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches) + + num_patches = 1 + if crop_to_patches and max_patches > 1: + num_columns, num_rows = get_optimal_tiled_canvas( + (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches + ) + if num_columns * num_rows > 1: + num_patches += num_columns * num_rows + + return num_patches + + +__all__ = ["Cohere2VisionImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc993967b5c767fe93f63fccb954b42a52a1c28 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -0,0 +1,425 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere2_vision.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring +from ...utils.generic import check_model_inputs +from ..auto import AutoModel +from .configuration_cohere2_vision import Cohere2VisionConfig + + +class Cohere2VisionMultiModalProjector(nn.Module): + def __init__(self, config: Cohere2VisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.intermediate_size = config.alignment_intermediate_size + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True + ) + self.act = nn.SiLU() + self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Cohere2Vision outputs, with hidden states and attentions. + """ +) +class Cohere2VisionModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Cohere2Vision causal language model (or autoregressive) outputs. + """ +) +class Cohere2VisionCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring +class Cohere2VisionPreTrainedModel(PreTrainedModel): + config: Cohere2VisionConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + _can_compile_fullgraph = False + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": "DecoderLayer", + "attentions": "Attention", + } + + +@auto_docstring( + custom_intro=""" + The Cohere2Vision model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class Cohere2VisionModel(Cohere2VisionPreTrainedModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: Cohere2VisionConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = Cohere2VisionMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, Cohere2VisionModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Cohere2VisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The COHERE2_VISION model which consists of a vision backbone and a language model. + """ +) +class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: Cohere2VisionConfig): + super().__init__(config) + self.model = Cohere2VisionModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features(self, pixel_values: torch.FloatTensor): + return self.model.get_image_features(pixel_values=pixel_values) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True) + >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto") + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg", + ... }, + ... {"type": "text", "text": "what is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Cohere2VisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["Cohere2VisionForConditionalGeneration", "Cohere2VisionPreTrainedModel", "Cohere2VisionModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..7ef20305b99e6e6bd68bf02a7e67cd642e2c9ca1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch AyaVision model.""" + +from functools import lru_cache +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn + +from transformers.models.aya_vision.modeling_aya_vision import ( + AyaVisionCausalLMOutputWithPast, + AyaVisionForConditionalGeneration, + AyaVisionModel, + AyaVisionModelOutputWithPast, +) +from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import check_model_inputs +from .configuration_cohere2_vision import Cohere2VisionConfig + + +logger = logging.get_logger(__name__) + + +class Cohere2VisionMultiModalProjector(nn.Module): + def __init__(self, config: Cohere2VisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.intermediate_size = config.alignment_intermediate_size + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True + ) + self.act = nn.SiLU() + self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class Cohere2VisionModelOutputWithPast(AyaVisionModelOutputWithPast): + pass + + +class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast): + pass + + +class Cohere2VisionModel(AyaVisionModel): + _checkpoint_conversion_mapping = {} + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, Cohere2VisionModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Cohere2VisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration): + _checkpoint_conversion_mapping = {} + + def get_image_features(self, pixel_values: torch.FloatTensor): + return self.model.get_image_features(pixel_values=pixel_values) + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True) + >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto") + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg", + ... }, + ... {"type": "text", "text": "what is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Cohere2VisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + +@lru_cache(maxsize=10) +def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: + """ + Computes all allowed aspect ratios for a given maximum number of input tiles. + + This function calculates all possible arrangements of tiles that can be formed + within the constraint of the maximum number of tiles. Each arrangement is + represented by its aspect ratio (width/height) and the corresponding tile configuration. + + Args: + max_image_tiles (`int`): + The maximum number of tiles allowed. + + Returns: + `list[tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height) + configuration in terms of number of tiles. + + Example: + >>> get_all_supported_aspect_ratios(4) + [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)] + + """ + aspect_ratios = [] + for width in range(1, max_image_tiles + 1): + for height in range(1, max_image_tiles + 1): + if width * height <= max_image_tiles: + aspect_ratios.append((width, height)) + return aspect_ratios + + +def get_optimal_tiled_canvas( + original_image_size: tuple[int, int], + target_tile_size: tuple[int, int], + min_image_tiles: int, + max_image_tiles: int, +) -> tuple[int, int]: + possible_resolutions = get_all_supported_aspect_ratios(max_image_tiles) + possible_resolutions = sorted(possible_resolutions, key=lambda x: x[0] * x[1]) + image_height, image_width = original_image_size + patch_size_height, patch_size_width = target_tile_size # (height == width) + + candidate_resolutions = np.array(possible_resolutions) * patch_size_height + original_size = np.stack([image_height, image_width]) + required_scales = candidate_resolutions / original_size + required_scale = np.min(required_scales, axis=-1, keepdims=True) # [n_resolutions, 1] + if np.all(required_scale < 1): + # We are forced to downscale, so try to minimize the amount of downscaling + best_grid = possible_resolutions[np.argmax(required_scale)] + else: + # Pick the resolution that required the least upscaling so that it most closely fits the image + required_scale = np.where(required_scale < 1.0, 10e9, required_scale) + best_grid = possible_resolutions[np.argmin(required_scale)] + return best_grid + + +@auto_docstring +class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): + size = {"height": 512, "width": 512} + min_patches = 1 + max_patches = 12 + crop_to_patches = True + patch_size = 16 + + +__all__ = [ + "Cohere2VisionForConditionalGeneration", + "Cohere2VisionPreTrainedModel", # noqa: F822 + "Cohere2VisionModel", + "Cohere2VisionImageProcessorFast", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..b72e1512ead9612c9e601101326f532ed3bf0d1a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class Cohere2VisionImagesKwargs(ImagesKwargs, total=False): + max_patches: Optional[int] + + +class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Cohere2VisionImagesKwargs + _defaults = { + "text_kwargs": { + "padding_side": "left", + "padding": True, + "return_mm_token_type_ids": False, + }, + } + + +class Cohere2VisionProcessor(ProcessorMixin): + r""" + Constructs a Cohere2Vision processor which wraps a [`AutoImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~Cohere2VisionProcessor.__call__`] and [`~Cohere2VisionProcessor.decode`] for more information. + Args: + image_processor ([`AutoImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + **kwargs, + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.patch_size = self.image_processor.patch_size + self.boi_token = tokenizer.boi_token + self.eoi_token = tokenizer.eoi_token + self.image_token = tokenizer.image_token + self.img_line_break_token = tokenizer.img_line_break_token + self.image_token_id = tokenizer.image_token_id + + self.image_ids = tokenizer.convert_tokens_to_ids( + [ + self.image_token, + self.boi_token, + self.eoi_token, + self.img_line_break_token, + ] + ) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + **kwargs: Unpack[Cohere2VisionProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. + To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if text is None: + raise ValueError("You have to specify text.") + elif not isinstance(text, (list, tuple)): + text = [text] + + output_kwargs = self._merge_kwargs( + Cohere2VisionProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # Process images + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + batch_num_patches = iter(image_inputs.pop("num_patches")) + processed_text = [] + for sample in text: + while self.image_token in sample: + num_patches = next(batch_num_patches) + img_patches_per_tile = int(self.patch_size**2) + + img_string = f"{self.boi_token}" + for idx in range(1, num_patches): + img_string += "" * img_patches_per_tile + self.img_line_break_token + img_string += "" * img_patches_per_tile + self.img_line_break_token + img_string += f"{self.eoi_token}" + + sample = sample.replace(self.image_token, img_string, 1) + processed_text.append(sample) + text = [sample.replace("", self.image_token) for sample in processed_text] + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = Cohere2VisionProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + + token_per_patch = int(self.patch_size**2) + num_image_tokens = [ + 2 + sum(token_per_patch + 1 for _ in range(num_patches)) for num_patches in num_image_patches + ] # Add +2 and +1 for BOI/EOI and image break tokens + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(tokenizer_input_names) + list(image_processor_input_names) + + +__all__ = ["Cohere2VisionProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9d5cc944a817b4eedb0d918aab1aba15e88804f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..269d2da2974769107e6b56b0300257f86b54e553 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5706329fa9d0eaa707931a9cbf2a762150a3811e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d130c76793909c4e4f55976cf1c260a02d04526 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3f2ec0260fabaa595c592f786ed7987022585b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1df410a0981173307eda7266c5c3ea629586d453 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2774052a1589971f7235fbd6e9cd94b7c815df4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee0ec9df5ed4c229a90783ecfb6d84b6f089e37 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d9c3a6ad8fb17bfedb985cc51da79c75dc0232b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc2ff757be25a7d7e9bfffdd11dd346c6a48cb79 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..662abc7dbc33f5665904117ae33b533396cd9344 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40c5e3cd1fed896d49c8aa90967f1604461b0d28 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ee9787cf49eb0e31311f38b8e05a362e39f00c4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..068b40435c7c3fcf26e6f79e75770949389803c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b3ce642d5148a9e52a4bc04ffa169146a645234 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee1f309e607af4c0c1550545ec822aa1ef750a0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b50084199fed7742cb7ccc63b7466829a77d11e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba66fb14a2f9c133788b9f2d3d4dd232df63ead2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d77cb3025910fc7a1db56e298d4a14fc5fd4627 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb98e5843061e90a3c5630914c7f41a3b9b02354 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6f7810ac3ecac4ee22c78816bd1ae7974a9643b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..529f2321642b2c4ca5aed584af46263c35e00c42 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..664014aed4f63cec3d04bc955a0e79d7c781d9c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd06c0dc23ae61fdaaa4d361e2bdb949718df946 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e98a4f0ba189bd275760924c021fdd9a1417fdae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f658dc253715b5d8957b7b1c18b43883c1722693 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f0c4b7cd565db868a1234e00904e97f2efeef08 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33f0bb1d445b20f9ab9c5005232658d972853534 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c429610f261518c8c184e0ad35ef8fba91bb742a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5343657b22912a96036076bd1b61e3f1fc9e1fa8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bec515dda6f1586763d86fbc3d56c2ea2e7d9f01 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fad387634d4219a8151cd052cfadf270f3a03622 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..759326e6b6979c77dd9e433bd2cccb199a4aa4cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d33cd9ac0633580ea92bcad7ee889519deba4d7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8df68220ce114d798e53eac94b03de956032d00 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8201972d61e752c0ffd93c1c062f604418f20405 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed5162ccb6d7655517546d65458fed94b0d374f6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b27386c4c990189b689b0a815594b5227866d42 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a9c0ae267281ec133b7a10fc8a33634e8617570 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0491301ccc8e9cb97126890ad5700d34c9d7cff1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b00e674b6c8bfced43aedd3fa62d835238fcfd2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0facdba6154a63c6541edcd405f74916486b8279 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d30748b8f8b363bfbbbc65b192c4d34402d312fd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73d92f41dcdba14af5f35f9d694fff41ee55a312 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab18853681c9d57b30bff23365330b33475a3505 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a7e9c89a5eec6a0ff5c1d2fc05f0203a27c9b03 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ea81a02094813cfed89e2b343f255c17050ae0c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50373dc1de6c27744f52a0e3706487d1aefd6bab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6488b055d4fe33f08c1d1a1ac0c07e7879353f56 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24e9b5c43bd7183878f579e4d6c4139d9794d96d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93b75021706bf9372741f3dd0843051bf5b4879e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22d5d4035f7dfc4ccae79dc6327156fa914b7fd0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e7e5b7b1901f11b058501d851f3dd39c819bac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ec772ff99042e51d2518304800f7c94f31be185 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b28084a30fc63c52a9a856ea070b068ff14e3b1b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57423d39c0a53570e5c2e97e7e5027ee7de76a91 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7baaf001c7756b563594b2ffa8ecf820dacfcc63 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..997fb71e021a364cccb9684f37fcd04d5195eec6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d818441b648a6b73ff0b75552a36a4471486094 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c77156e588696aed379d44e6fe5810fc080e1c1e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8798159fb8c8a5dcb00e5bec17ae6b117b479f77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85f002bc8f134bdd02558d911cacf685b56c905c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8936afb6c0bed2c755998863252d8ba8edf6e15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0f583c0ecc79fad5498dcaaf364cd6c39473d3e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53f493dab11b1ef493b18d75ddf84b8a933cf4d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a6c14a1d11cded6ee77b9683ac64ddf9851dc1b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed8941496901fb69b0eccb8b8e26010ec2e60c7b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b309ff23029e1d9365ba172230c6fa00c345c0a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d92e37e13eae72dce715e4d95811c6d667c75a7e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecd98bb30237a0ac68d9aa7ffca03ebe81676a02 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e39ca858f266338346ed75f1bd3167c80df91b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3efe9a62c2af8b4bc0f9be522b284e668f95d1a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a49e89f9695c18d74cdf9ba81c9c246a87538c8b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663012339337cc1ffdd8f613d216428baaca9072 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04d30077506511b9abc7b31601daebe76b874534 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67fb5d2ce23749442a3029565f1a662657d92352 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee3011aca424d392bcd5c0e08012eeea1f4fa5ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc9749bac04ef66c1d6d827870eb171876116c21 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c7d206f499e7c947fcdfd2576cc61185f2298dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d21fbb059d8780ad2d6bfe908264badfc0892d52 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d32e2ef88daa2379086aa3025f371b817d95689 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..505372a1f3126bab069a15f2df6f820b5113511e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d6f2bd0d56c9cc85a2fa37d136578a94bd34411 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad05ad84f244edf0dc46b84d7353d41cc4b0b9c5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d639585a62d2600911df909d11a7fc4d5463e41a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb559947f1e645d567910f0400c453a4249b2956 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbd01b1d2a8d4aa5fec0be1c8c0a3a2d2a121fad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c3d28286c72c6b83c49aa4aae19fe5bac85416f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83a1002a93b984d3397a241a4d33697a1e631f16 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec068e6350e977f731ca352b584ff86e93abeb62 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cf611b71d3e333ef4ed226dc8febf7537d5f738 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12b18b1be7225caf0fc028d23aa8379f32709a5a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca490e414aa5e3c6f9141c6c634543a73eaae001 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07a9692c891e55aa62c68dd0f84cd5863308d989 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0e98c6ed1348cbf4d5024e9d79f3ae2a3761b55 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6104b101cbb7e3f9f9b129bc2fc48296877e0f8b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e65c5409aeb7b4702b794bfc80ad28099948dd36 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f46b323f6debbdc8199e00a63cd060fd1653b184 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fb0cfed949f1613f124f73add749f07d18309b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..181bdafbadd4e47282728c1b1a14f6ad454b83e5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d71ec4bb8573a1ae37d8dd1be174e1cec7416f4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..275528a54427b5575a0f455a3558ba92e11ca027 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35e5505a14e5c096c9a585a89d80d88f2b71a8e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1b969fca47c5e08a2f3f3c5f5a41d9e64fd577a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c61587acbc74c55dd8cbb26104f4c0e4f1efad8f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a4e88106d6317e7a359c9f67c2b03b29499fbcb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84f0f6a14dc631ce080ac6f2d36f84dbe4686503 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb9ea6aa3292c0693b3ad753a4e3c482f6297642 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c54dd6bce9331efad73dadbbf8325fc8ccdf98f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3b73e39bedb436c6eeaaad645ed3614908def5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3df676e90e47b24dc1067fd19e424be260d0835 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec868cb35462973dab88650ce9d56b7dbc85ae22 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c13b073a9c4f32f589a3bc94ac0a20086ffd0854 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76f40143ba9dfbbd6b311c907b3a0fcbb8a55892 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5ee3036f7ccd339675c9f131226c871e983d076 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20e9a6e54913fd471f6c05ffa180d12dd43b3299 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a766af57b76487fe84e4fdf7ff4dcc5dab6b5f04 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0eb9b8732929c0f1ddcea743296272ff3d733ce8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1082209e23e7110e9a45c80a80ffe4ff3da543f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51206db9eb1a3aa8ca01aa7c5d645d2e10a94e93 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4acfd5284e937d297cc6537cdc63a89b5a6fa6d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8bee80fd9e664f74c0631e74c7924263ae37c8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32a64173e4d529cd6fa975a878343540ae94ec5a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d6caf685e78df6a2580229e7a0b7d4acb26dd7c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e27efac01e38f50ea377d33d8028d10eca679cd0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf4f92fd20d225ecd05171e112e02c3691971e45 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8094c60c893d1a8d262cca889dc6b2de0bd1ac6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5f8da49cb85bb12423a85c85479b5bc654b1a9e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77c15953ba03c2e220a4a1978fd5ae8afcb74d85 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8aae5e70381b21772bdec395693c007a9c02b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_detr import * + from .feature_extraction_detr import * + from .image_processing_detr import * + from .image_processing_detr_fast import * + from .modeling_detr import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..a81f83c8c313bdb8a904f0b359360c0e100a83d9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DETR.""" + +import warnings + +from ...image_transforms import rgb_to_id as _rgb_to_id +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_detr import DetrImageProcessor + + +logger = logging.get_logger(__name__) + + +def rgb_to_id(x): + warnings.warn( + "rgb_to_id has moved and will not be importable from this module from v5. " + "Please import from transformers.image_transforms instead.", + FutureWarning, + ) + return _rgb_to_id(x) + + +@requires(backends=("vision",)) +class DetrFeatureExtractor(DetrImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use DetrImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["DetrFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..f29bd48a5934cca8224a91d0cc25ce54f79e169f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py @@ -0,0 +1,2049 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DETR.""" + +import io +import pathlib +from collections import defaultdict +from collections.abc import Iterable +from typing import Any, Callable, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_annotations, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) +from ...utils.import_utils import requires + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76 +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + return (oh, ow) + + +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, tuple[int, int], list[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `tuple[int, int]` or `list[int]`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DETR. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for DETR. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +def get_segmentation_image( + masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +def get_mask_area(seg_img: np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray: + final_h, final_w = target_size + np_seg_img = seg_img.astype(np.uint8) + np_seg_img = np_seg_img.reshape(final_h, final_w, 3) + m_id = rgb_to_id(np_seg_img) + area = [(m_id == i).sum() for i in range(n_classes)] + return area + + +def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + + +def post_process_panoptic_sample( + out_logits: np.ndarray, + masks: np.ndarray, + boxes: np.ndarray, + processed_size: tuple[int, int], + target_size: tuple[int, int], + is_thing_map: dict, + threshold=0.85, +) -> dict: + """ + Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample. + + Args: + out_logits (`torch.Tensor`): + The logits for this sample. + masks (`torch.Tensor`): + The predicted segmentation masks for this sample. + boxes (`torch.Tensor`): + The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y, + width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding). + processed_size (`tuple[int, int]`): + The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size + after data augmentation but before batching. + target_size (`tuple[int, int]`): + The target size of the image, `(height, width)` corresponding to the requested final size of the + prediction. + is_thing_map (`Dict`): + A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not. + threshold (`float`, *optional*, defaults to 0.85): + The threshold used to binarize the segmentation masks. + """ + # we filter empty queries and detection below threshold + scores, labels = score_labels_from_class_probabilities(out_logits) + keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold) + + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_boxes = center_to_corners_format(boxes[keep]) + + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + cur_masks = masks[keep] + cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR) + cur_masks = safe_squeeze(cur_masks, 1) + b, h, w = cur_masks.shape + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.reshape(b, -1) + stuff_equiv_classes = defaultdict(list) + for k, label in enumerate(cur_classes): + if not is_thing_map[label]: + stuff_equiv_classes[label].append(k) + + seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores)) + + # We filter out any mask that is too small + if cur_classes.size() > 0: + # We know filter empty masks as long as we find some + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + while filtered_small.any(): + cur_masks = cur_masks[~filtered_small] + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores)) + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + else: + cur_classes = np.ones((1, 1), dtype=np.int64) + + segments_info = [ + {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a} + for i, (cat, a) in enumerate(zip(cur_classes, area)) + ] + del cur_classes + + with io.BytesIO() as out: + PIL.Image.fromarray(seg_img).save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + + return predictions + + +def resize_annotation( + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +# TODO - (Amy) make compatible with other frameworks +def binary_mask_to_rle(mask): + """ + Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# TODO - (Amy) make compatible with other frameworks +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. + """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[set[int]] = None, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: list[dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +@requires(backends=("vision",)) +class DetrImageProcessor(BaseImageProcessor): + r""" + Constructs a Detr image processor. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to True): + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: bool = True, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + self._valid_processor_keys = [ + "images", + "annotations", + "return_segmentation_masks", + "masks_path", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "do_convert_annotations", + "image_mean", + "image_std", + "do_pad", + "pad_size", + "format", + "return_tensors", + "data_format", + "input_data_format", + ] + + @classmethod + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: np.ndarray, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into DETR model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + new_size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # TODO (Amy) - update to use `rescale_factor` instead of `scale` + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. + """ + return normalize_annotation(annotation, image_size=image_size) + + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + ) + return padded_image, annotation + + def pad( + self, + images: list[np.ndarray], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + pad_size: Optional[dict[str, int]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (list[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + padded_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `list[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=do_convert_annotations, + return_tensors=return_tensors, + pad_size=pad_size, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258 + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + return results + + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. + threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_semantic_segmentation`.", + ) + out_logits, raw_masks = outputs.logits, outputs.pred_masks + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports + PyTorch. + + Args: + results (`list[Dict]`): + Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added. + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_instance_segmentation`.", + ) + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction. + If left to None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_panoptic_segmentation`.", + ) + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_labels): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_labels): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_labels.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_labels = cur_labels[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_labels[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_labels + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258 + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None + ): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None): + """ + Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + Raw outputs of the model. + target_sizes (`list[tuple[int, int]]`, *optional*): + A list of tuples (`tuple[int, int]`) containing the target size (height, width) of each image in the + batch. If unset, predictions will not be resized. + Returns: + `list[torch.Tensor]`: + A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width) + corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each + `torch.Tensor` correspond to a semantic class id. + """ + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Semantic segmentation logits of shape (batch_size, num_classes, height, width) + segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + batch_size = class_queries_logits.shape[0] + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + semantic_segmentation = [] + for idx in range(batch_size): + resized_logits = nn.functional.interpolate( + segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = segmentation.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 + def post_process_instance_segmentation( + self, + outputs, + threshold: float = 0.5, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + target_sizes: Optional[list[tuple[int, int]]] = None, + return_coco_annotation: Optional[bool] = False, + ) -> list[dict]: + """ + Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.5): + The probability score threshold to keep predicted instance masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8): + The overlap mask area threshold to merge or discard small disconnected parts within each binary + instance mask. + target_sizes (`list[Tuple]`, *optional*): + List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested + final size (height, width) of each prediction. If unset, predictions will not be resized. + return_coco_annotation (`bool`, *optional*): + Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE) + format. + Returns: + `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: + - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or + `list[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to + `True`. Set to `None` if no mask if found above `threshold`. + - **segments_info** -- A dictionary that contains additional information on each segment. + - **id** -- An integer representing the `segment_id`. + - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. + - **score** -- Prediction score of segment with `segment_id`. + """ + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: list[dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=[], + target_size=target_size, + ) + + # Return segmentation map in run-length encoding (RLE) format + if return_coco_annotation: + segmentation = convert_segmentation_to_rle(segmentation) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 + def post_process_panoptic_segmentation( + self, + outputs, + threshold: float = 0.5, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[set[int]] = None, + target_sizes: Optional[list[tuple[int, int]]] = None, + ) -> list[dict]: + """ + Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports + PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + The outputs from [`DetrForSegmentation`]. + threshold (`float`, *optional*, defaults to 0.5): + The probability score threshold to keep predicted instance masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8): + The overlap mask area threshold to merge or discard small disconnected parts within each binary + instance mask. + label_ids_to_fuse (`Set[int]`, *optional*): + The labels in this state will have all their instances be fused together. For instance we could say + there can only be one sky in an image, but several persons, so the label ID for sky would be in that + set, but not the one for person. + target_sizes (`list[Tuple]`, *optional*): + List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested + final size (height, width) of each prediction in batch. If unset, predictions will not be resized. + Returns: + `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: + - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or + `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to + the corresponding `target_sizes` entry. + - **segments_info** -- A dictionary that contains additional information on each segment. + - **id** -- an integer representing the `segment_id`. + - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. + - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise. + Multiple instances of the same class / label were fused and assigned a single `segment_id`. + - **score** -- Prediction score of segment with `segment_id`. + """ + + if label_ids_to_fuse is None: + logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.") + label_ids_to_fuse = set() + + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: list[dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=label_ids_to_fuse, + target_size=target_size, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + +__all__ = ["DetrImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..86835ca62cfc02f7e2f8f134c9824c3eda1f31e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py @@ -0,0 +1,1693 @@ +# coding=utf-8 +# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DETR model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + auto_docstring, + is_timm_available, + logging, + requires_backends, +) +from ...utils.backbone_utils import load_backbone +from .configuration_detr import DetrConfig + + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + """ +) +class DetrDecoderOutput(BaseModelOutputWithCrossAttentions): + r""" + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + """ +) +class DetrModelOutput(Seq2SeqModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`DetrForObjectDetection`]. + """ +) +class DetrObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`DetrForSegmentation`]. + """ +) +class DetrSegmentationOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also + [`~DetrImageProcessor.post_process_semantic_segmentation`] or + [`~DetrImageProcessor.post_process_instance_segmentation`] + [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic + segmentation masks respectively. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + pred_masks: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + + +# BELOW: utilities copied from +# https://github.com/facebookresearch/detr/blob/master/backbone.py +class DetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DetrFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class DetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class DetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class DetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float() + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class DetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = DetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]): + return tensor if object_queries is None else tensor + object_queries + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + spatial_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size, target_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if object_queries is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, object_queries) + + # add key-value position embeddings to the key value states + if spatial_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + if attention_mask.dtype == torch.bool: + attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_( + attention_mask, -torch.inf + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DetrEncoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + object_queries: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + Object queries (also called content embeddings), to be added to the hidden states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + object_queries=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DetrDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = DetrAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + object_queries that are added to the hidden states + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + object_queries=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + object_queries=query_position_embeddings, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + spatial_position_embeddings=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class DetrPreTrainedModel(PreTrainedModel): + config: DetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"] + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, DetrMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + elif isinstance(module, DetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class DetrEncoder(DetrPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`DetrEncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for DETR: + + - object_queries are added to the forward pass. + + Args: + config: DetrConfig + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + object_queries=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Object queries that are added to the queries in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + # we add object_queries as extra input to the encoder_layer + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + object_queries=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class DetrDecoder(DetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for DETR: + + - object_queries and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. + + Args: + config: DetrConfig + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + object_queries=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Object queries that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the values and keys in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + combined_attention_mask, + object_queries, + query_position_embeddings, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate] + if v is not None + ) + return DetrDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + ) + + +@auto_docstring( + custom_intro=""" + The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """ +) +class DetrModel(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = DetrConvEncoder(config) + object_queries = build_position_encoding(config) + self.backbone = DetrConvModel(backbone, object_queries) + + # Create projection layer + self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1) + + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = DetrEncoder(config) + self.decoder = DetrDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DetrModelOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetrModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") + >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 100, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, object_queries_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + object_queries=object_queries, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + object_queries through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + attention_mask=None, + object_queries=object_queries, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return DetrModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + ) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@auto_docstring( + custom_intro=""" + DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks + such as COCO detection. + """ +) +class DetrForObjectDetection(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # DETR encoder-decoder model + self.model = DetrModel(config) + + # Object detection heads + self.class_labels_classifier = nn.Linear( + config.d_model, config.num_labels + 1 + ) # We add one for the "no object" class + self.bbox_predictor = DetrMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DetrObjectDetectionOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetrForObjectDetection + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") + >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ + ... 0 + ... ] + + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98] + Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66] + Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76] + Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93] + Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # class logits + predicted bounding boxes + logits = self.class_labels_classifier(sequence_output) + pred_boxes = self.bbox_predictor(sequence_output).sigmoid() + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + outputs_class, outputs_coord = None, None + if self.config.auxiliary_loss: + intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] + outputs_class = self.class_labels_classifier(intermediate) + outputs_coord = self.bbox_predictor(intermediate).sigmoid() + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@auto_docstring( + custom_intro=""" + DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks + such as COCO panoptic. + """ +) +class DetrForSegmentation(DetrPreTrainedModel): + def __init__(self, config: DetrConfig): + super().__init__(config) + + # object detection model + self.detr = DetrForObjectDetection(config) + + # segmentation head + hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads + intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes + + self.mask_head = DetrMaskHeadSmallConv( + hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size + ) + + self.bbox_attention = DetrMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DetrSegmentationOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a + `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`. + + Examples: + + ```python + >>> import io + >>> import requests + >>> from PIL import Image + >>> import torch + >>> import numpy + + >>> from transformers import AutoImageProcessor, DetrForSegmentation + >>> from transformers.image_transforms import rgb_to_id + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") + >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps + >>> # Segmentation results are returned as a list of dictionaries + >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)]) + + >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found + >>> panoptic_seg = result[0]["segmentation"] + >>> # Get prediction score and segment_id to class_id mapping of each segment + >>> panoptic_segments_info = result[0]["segments_info"] + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones((batch_size, height, width), device=device) + + # First, get list of feature maps and position embeddings + features, object_queries_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask) + + # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_map, mask = features[-1] + batch_size, num_channels, height, width = feature_map.shape + projected_feature_map = self.detr.model.input_projection(feature_map) + + # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC + # In other words, turn their shape into (batch_size, sequence_length, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + flattened_mask = mask.flatten(1) + + # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) + if encoder_outputs is None: + encoder_outputs = self.detr.model.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + object_queries=object_queries, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + position embeddings through the decoder (which is conditioned on the encoder output) + query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + queries = torch.zeros_like(query_position_embeddings) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.detr.model.decoder( + inputs_embeds=queries, + attention_mask=None, + object_queries=object_queries, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Sixth, compute logits, pred_boxes and pred_masks + logits = self.detr.class_labels_classifier(sequence_output) + pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid() + + memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width) + mask = flattened_mask.view(batch_size, height, width) + + # FIXME h_boxes takes the last one computed, keep this in mind + # important: we need to reverse the mask, since in the original implementation the mask works reversed + # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32) + bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask) + + seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]]) + + pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + outputs_class, outputs_coord = None, None + if self.config.auxiliary_loss: + intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] + outputs_class = self.detr.class_labels_classifier(intermediate) + outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid() + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs + else: + output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DetrSegmentationOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + pred_masks=pred_masks, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py +class DetrMaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + if dim % 8 != 0: + raise ValueError( + "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in" + " GroupNorm is set to 8" + ) + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: list[Tensor]): + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with + # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). + # We expand the projected feature map to match the number of heads. + x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = nn.functional.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = nn.functional.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = nn.functional.relu(x) + + x = self.out_lay(x) + return x + + +class DetrMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +__all__ = [ + "DetrForObjectDetection", + "DetrForSegmentation", + "DetrModel", + "DetrPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93be9e5f1e3588c01952c1d87a70ef6ce500ecc4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b586d9fef055c85d750e118f2d830f7eb4064994 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09ddedc251f4620bb5ba2126e8a6d9e71a6c1879 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1e99600d7c072ae8765a118a2cd144593b89e04 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b344960e6ce231f9d6dd63215971482dce5a7103 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb550e61c6940f3d454202c26aceab907300592e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22b0622c27ceda17f08536c94ccb4acb625c03d7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56b4b7e884c57aa96066e57d8bb18d0fd1786bd7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e5be363e1b2fba80cccee13a65973823e32610b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..549d3adfb3bcfa036406792bf2903ff5ea9271d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79b8e2a1008b2abcda69eca57e01bcd2f71fd4c1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..068afca20bca434bbbbe076421f498161cb2ccfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c1885fc10cb82d1e777c29d3131ddfb068b7be5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d29c8a431e95f4b85af7979efe21f71e86257c16 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1db797287c4afed001d62c7093d4fe97a864746d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47beca258c26eb2cbee9ae5dcd4510cf21d0791e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8652623285ebf5b40684c48e2a2012d6a829ad2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8007eb0d4c644adf6d1d24b67678fa410e0a5182 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7487dc99f8a258053173c016db9d93241140a728 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38b2479ea69e3e5eca3e0cd497495bf0792ea909 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce4e6fd237c20dd953970f619cc05687fba86c45 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e89d2f97b8337befafcce628de4a6d36678bd18 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3982c72c0798f93ade8f59b71feb476b8940c32d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..409ffbec111390e5f31e4e945f97385b1f83f539 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123043725bacea7395b80b3586c7fe709b106bcf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1f7b2a69f389abd390b15cb11579e59878bd94c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1762e1424b76626844937606610c571da0a6ea2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e24155031b1bf4d68e99ad5f95bf4983f905043a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c95ed77320da454e12484919e297fc2e4ffb8cd4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..425abfc6de1ee98a488b66a86ab21a3ba967fc99 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..658c6c6e846dbf1908daef2dafc3d6a2105aa222 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c640d810b48bdce483ca3d9cdc838403aa97db75 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5267484d2063c29e718a37ccef484196207fb946 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84979fd784ad1785c3cd57dc7ed9c5096c8f3de4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..122cd1a9f6acac7bb722141c418c9144cbaa0d68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ced9484153c8b2b27e7e180a685bd8de44f980cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da28b3ab0e9965143e3b327da4b99c2075cce626 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66a2d002f8c0b45480682a56c154ea34f9f21f4f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e763d63c7aa4724f135d9d457060e1b24c127696 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c219f75ffd2eb219985be6968e9176a4c095f638 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c38b3c5944ac0b1e967c817eb80e60ec01386039 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3e31f74d6f64e3ef2b35090174810bd5039ce30 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5507a43b746def945e48a9ed90c66c15a3bf7cb9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf3c11e4330eb341c8e515fef3ec797b8f057fc7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee253b7b557c34b98ca22323e74a5a8afff2cbb8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2ca02658cd35fb004236ce7451806fc24ecfcf8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2e715f93241e5cf9b6537ab4c5f47fd9d904a75 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c6f92fecece3367b9ef64c39e9fb183fa2ec7f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3dc0447909984bd21a411654459bbb4c660b6f95 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f779cd0aee0cb35ea9c52f146dd3e138b41373ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4a614fecb6e8e85d374cf49a12c3c8df1bf4431 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a343b38b039690e19562170d1b9b985b4b26201 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c63c9ef5db5ecdcf1e53633ed266026cf7c71d4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6847c790e11a5e2a8e23c23781dabf4b76656641 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e981d9cbcb1e456c206b3bec252df1598e23575a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_flaubert import * + from .modeling_flaubert import * + from .modeling_tf_flaubert import * + from .tokenization_flaubert import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..071a74fe69b420954ff6ce7154b227c0e0d7e4dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flaubert configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FlaubertConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is + used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT + [flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + pre_norm (`bool`, *optional*, defaults to `False`): + Whether to apply the layer normalization before or after the feed forward layer following the attention in + each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018) + layerdrop (`float`, *optional*, defaults to 0.0): + Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with + Structured Dropout. ICLR 2020) + vocab_size (`int`, *optional*, defaults to 30145): + Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`FlaubertModel`] or [`TFFlaubertModel`]. + emb_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention mechanism + gelu_activation (`bool`, *optional*, defaults to `True`): + Whether or not to use a *gelu* activation instead of *relu*. + sinusoidal_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (`bool`, *optional*, defaults to `False`): + Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in + order to only attend to the left-side context instead if a bidirectional context. + asm (`bool`, *optional*, defaults to `False`): + Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction + layer. + n_langs (`int`, *optional*, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (`bool`, *optional*, defaults to `True`) + Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual + models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for information + on how to use them. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + embed_init_std (`float`, *optional*, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. + init_std (`int`, *optional*, defaults to 50257): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the + embedding matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (`int`, *optional*, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (`int`, *optional*, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (`int`, *optional*, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (`int`, *optional*, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (`int`, *optional*, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder(`bool`, *optional*, defaults to `True`): + Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + summary_type (`string`, *optional*, defaults to "first"): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Used in the sequence classification and multiple choice models. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Used in the sequence classification and multiple choice models. + + The dropout ratio to be used after the projection and activation. + start_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + end_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + mask_token_id (`int`, *optional*, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (`int`, *optional*, defaults to 1): + The ID of the language used by the model. This parameter is used when generating text in a given language. + """ + + model_type = "flaubert" + attribute_map = { + "hidden_size": "emb_dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + "n_words": "vocab_size", # For backward compatibility + } + + def __init__( + self, + pre_norm=False, + layerdrop=0.0, + vocab_size=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048**-0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + summary_type="first", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + mask_token_id=0, + lang_id=0, + pad_token_id=2, + bos_token_id=0, + **kwargs, + ): + """Constructs FlaubertConfig.""" + self.pre_norm = pre_norm + self.layerdrop = layerdrop + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + self.mask_token_id = mask_token_id + self.lang_id = lang_id + + if "n_words" in kwargs: + self.n_words = kwargs["n_words"] + + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) + + +class FlaubertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["FlaubertConfig", "FlaubertOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..1dadc6f5377b4007c78f44d81d7a39acf2dedbdb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py @@ -0,0 +1,1700 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Flaubert model, based on XLM.""" + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import gelu, get_activation +from ...cache_utils import DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, logging +from .configuration_flaubert import FlaubertConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) + out.requires_grad = False + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + + +# Copied from transformers.models.xlm.modeling_xlm.get_masks +def get_masks(slen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. + """ + alen = torch.arange(slen, dtype=torch.long, device=lengths.device) + if padding_mask is not None: + mask = padding_mask + else: + assert lengths.max().item() <= slen + mask = alen < lengths[:, None] + + # attention mask is the same as mask, or triangular inferior attention (causal) + bs = lengths.size(0) + if causal: + attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] + else: + attn_mask = mask + + # sanity check + assert mask.size() == (bs, slen) + assert causal is False or attn_mask.size() == (bs, slen, slen) + + return mask, attn_mask + + +# Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention +class MultiHeadAttention(nn.Module): + def __init__(self, n_heads, dim, config, layer_idx: int = 0): + super().__init__() + self.layer_id = layer_idx + self.dim = dim + self.n_heads = n_heads + self.head_dim = dim // n_heads + self.dropout = config.attention_dropout + assert self.dim % self.n_heads == 0 + + self.q_lin = nn.Linear(dim, dim) + self.k_lin = nn.Linear(dim, dim) + self.v_lin = nn.Linear(dim, dim) + self.out_lin = nn.Linear(dim, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + input, + mask, + kv=None, + cache=None, + head_mask=None, + output_attentions=False, + cache_position=None, + ): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). + """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.size() + is_cross_attention = kv is not None + mask_reshape = (bs, 1, qlen, -1) if mask.dim() == 3 else (bs, 1, 1, -1) + + q = self.q_lin(input).view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + if cache is not None: + if isinstance(cache, EncoderDecoderCache): + is_updated = cache.is_updated.get(self.layer_id) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = cache.cross_attention_cache + else: + curr_past_key_value = cache.self_attention_cache + else: + curr_past_key_value = cache + + current_states = kv if is_cross_attention else input + if is_cross_attention and cache is not None and is_updated: + # reuse k,v, cross_attentions + k = curr_past_key_value.key_cache[self.layer_id] + v = curr_past_key_value.value_cache[self.layer_id] + else: + k = self.k_lin(current_states) + v = self.v_lin(current_states) + k = k.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + + if cache is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + k, v = curr_past_key_value.update(k, v, self.layer_id, {"cache_position": cache_position}) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + cache.is_updated[self.layer_id] = True + + q = q / math.sqrt(self.head_dim) # (bs, n_heads, qlen, head_dim) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen) + + weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = nn.functional.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, qlen, head_dim) + context = context.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.head_dim) + + outputs = (self.out_lin(context),) + if output_attentions: + outputs = outputs + (weights,) + return outputs + + +# Copied from transformers.models.xlm.modeling_xlm.TransformerFFN +class TransformerFFN(nn.Module): + def __init__(self, in_dim, dim_hidden, out_dim, config): + super().__init__() + self.dropout = config.dropout + self.lin1 = nn.Linear(in_dim, dim_hidden) + self.lin2 = nn.Linear(dim_hidden, out_dim) + self.act = gelu if config.gelu_activation else nn.functional.relu + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + def forward(self, input): + return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) + + def ff_chunk(self, input): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) + return x + + +@auto_docstring( + custom_intro=""" + The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top. + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMPredLayer with XLM->Flaubert +class FlaubertPredLayer(nn.Module): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + + def __init__(self, config): + super().__init__() + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + dim = config.emb_dim + + if config.asm is False: + self.proj = nn.Linear(dim, config.n_words, bias=True) + else: + self.proj = nn.AdaptiveLogSoftmaxWithLoss( + in_features=dim, + n_classes=config.n_words, + cutoffs=config.asm_cutoffs, + div_value=config.asm_div_value, + head_bias=True, # default is False + ) + + def forward(self, x, y=None): + """Compute the loss, and optionally the scores.""" + outputs = () + if self.asm is False: + scores = self.proj(x) + outputs = (scores,) + outputs + if y is not None: + loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean") + outputs = (loss,) + outputs + else: + scores = self.proj.log_prob(x) + outputs = (scores,) + outputs + if y is not None: + _, loss = self.proj(x, y) + outputs = (loss,) + outputs + + return outputs + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`]. + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert +class FlaubertSquadHeadOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. + start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + """ + + loss: Optional[torch.FloatTensor] = None + start_top_log_probs: Optional[torch.FloatTensor] = None + start_top_index: Optional[torch.LongTensor] = None + end_top_log_probs: Optional[torch.FloatTensor] = None + end_top_index: Optional[torch.LongTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerStartLogits with XLM->Flaubert +class FlaubertPoolerStartLogits(nn.Module): + """ + Compute SQuAD start logits from sequence hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, 1) + + def forward( + self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + Returns: + `torch.FloatTensor`: The start logits for SQuAD. + """ + x = self.dense(hidden_states).squeeze(-1) + + if p_mask is not None: + if p_mask.dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerEndLogits with XLM->Flaubert +class FlaubertPoolerEndLogits(nn.Module): + """ + Compute SQuAD end logits from sequence hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense_1 = nn.Linear(config.hidden_size, 1) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The end logits for SQuAD. + """ + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + slen, hsz = hidden_states.shape[-2:] + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + + x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x).squeeze(-1) + + if p_mask is not None: + if p_mask.dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerAnswerClass with XLM->Flaubert +class FlaubertPoolerAnswerClass(nn.Module): + """ + Compute SQuAD 2.0 answer class from classification and start tokens hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The SQuAD 2.0 answer class. + """ + # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. + hsz = hidden_states.shape[-1] + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + + if cls_index is not None: + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + + x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) + x = self.activation(x) + x = self.dense_1(x).squeeze(-1) + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSQuADHead with XLM->Flaubert +class FlaubertSQuADHead(nn.Module): + r""" + A SQuAD head inspired by XLNet. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.start_n_top = config.start_n_top + self.end_n_top = config.end_n_top + + self.start_logits = FlaubertPoolerStartLogits(config) + self.end_logits = FlaubertPoolerEndLogits(config) + self.answer_class = FlaubertPoolerAnswerClass(config) + + @auto_docstring + def forward( + self, + hidden_states: torch.FloatTensor, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + is_impossible: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + return_dict: bool = False, + ) -> Union[FlaubertSquadHeadOutput, tuple[torch.FloatTensor]]: + r""" + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + Final hidden states of the model on the sequence tokens. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the first token for the labeled span. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the last token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Whether the question has a possible answer in the paragraph or not. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + """ + start_logits = self.start_logits(hidden_states, p_mask=p_mask) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, let's remove the dimension added by batch splitting + for x in (start_positions, end_positions, cls_index, is_impossible): + if x is not None and x.dim() > 1: + x.squeeze_(-1) + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + + loss_fct = CrossEntropyLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) + loss_fct_cls = nn.BCEWithLogitsLoss() + cls_loss = loss_fct_cls(cls_logits, is_impossible) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + return FlaubertSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,) + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) + + if not return_dict: + return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + else: + return FlaubertSquadHeadOutput( + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Flaubert +class FlaubertSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. + """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert +class FlaubertPreTrainedModel(PreTrainedModel): + config: FlaubertConfig + load_tf_weights = None + base_model_prefix = "transformer" + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + @property + def dummy_inputs(self): + inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + if self.config.use_lang_emb and self.config.n_langs > 1: + langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + else: + langs_list = None + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Embedding): + if self.config is not None and self.config.embed_init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if isinstance(module, nn.Linear): + if self.config is not None and self.config.init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.init_std) + if module.bias is not None: + nn.init.constant_(module.bias, 0.0) + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings: + create_sinusoidal_embeddings( + self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight + ) + + +@auto_docstring +class FlaubertModel(FlaubertPreTrainedModel): + def __init__(self, config): # , dico, is_encoder, with_output): + super().__init__(config) + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently Flaubert can only be used as an encoder") + # self.with_output = with_output + self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + self.dropout = config.dropout + self.attention_dropout = config.attention_dropout + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" + + # embeddings + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) + self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) + self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) + + # transformer layers + self.attentions = nn.ModuleList() + self.layer_norm1 = nn.ModuleList() + self.ffns = nn.ModuleList() + self.layer_norm2 = nn.ModuleList() + # if self.is_decoder: + # self.layer_norm15 = nn.ModuleList() + # self.encoder_attn = nn.ModuleList() + + for i in range(self.n_layers): + self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config, layer_idx=i)) + self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) + self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + # Initialize weights and apply final processing + self.post_init() + + self.layerdrop = getattr(config, "layerdrop", 0.0) + self.pre_norm = getattr(config, "pre_norm", False) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings + def get_input_embeddings(self): + return self.embeddings + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel.set_input_embeddings + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.attentions[layer].prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + lengths: Optional[torch.LongTensor] = None, + cache: Optional[dict[str, torch.FloatTensor]] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, torch.FloatTensor]`, *optional*): + Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # removed: src_enc=None, src_len=None + if input_ids is not None: + bs, slen = input_ids.size() + else: + bs, slen = inputs_embeds.size()[:-1] + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if cache is None: + cache = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if isinstance(cache, tuple): + cache = EncoderDecoderCache.from_legacy_cache(cache) + + if lengths is None: + if input_ids is not None: + lengths = (input_ids != self.pad_index).sum(dim=1).long() + else: + lengths = torch.tensor([slen] * bs, device=device) + # mask = input_ids != self.pad_index + + # check inputs + assert lengths.size(0) == bs + assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # Setting the position-ids to the registered buffer in constructor, it helps + # when tracing the model without passing position-ids, solves + # issues similar to issue #5664 + if position_ids is None: + if hasattr(self, "position_ids"): + position_ids = self.position_ids[:, :slen] + position_ids = position_ids.expand((bs, slen)) + else: + position_ids = torch.arange(slen, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).expand((bs, slen)) + else: + assert position_ids.size() == (bs, slen) # (slen, bs) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + assert langs.size() == (bs, slen) # (slen, bs) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.n_layers) + + # do not recompute cached elements + if cache is not None and input_ids is not None: + _slen = slen - cache.get_seq_length() + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) + if langs is not None and self.use_lang_emb and self.config.n_langs > 1: + tensor = tensor + self.lang_embeddings(langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + tensor = self.layer_norm_emb(tensor) + tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training) + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # transformer layers + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + for i in range(self.n_layers): + # LayerDrop + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # self attention + if not self.pre_norm: + attn_outputs = self.attentions[i]( + tensor, + attn_mask, + cache=cache, + head_mask=head_mask[i], + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn = attn_outputs[0] + if output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + else: + tensor_normalized = self.layer_norm1[i](tensor) + attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i]) + attn = attn_outputs[0] + if output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + tensor = tensor + attn + + # FFN + if not self.pre_norm: + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + else: + tensor_normalized = self.layer_norm2[i](tensor) + tensor = tensor + self.ffns[i](tensor_normalized) + + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # Add last hidden state + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + + return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) + + +@auto_docstring( + custom_intro=""" + The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["pred_layer.proj.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = FlaubertModel(config) + self.pred_layer = FlaubertPredLayer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.pred_layer.proj + + def set_output_embeddings(self, new_embeddings): + self.pred_layer.proj = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + # Overwritten -- uses a language id + + mask_token_id = self.config.mask_token_id + lang_id = self.config.lang_id + + effective_batch_size = input_ids.shape[0] + mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device) + input_ids = torch.cat([input_ids, mask_token], dim=1) + if lang_id is not None: + langs = torch.full_like(input_ids, lang_id) + else: + langs = None + return {"input_ids": input_ids, "langs": langs} + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, torch.FloatTensor]`, *optional*): + Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided. + + if not return_dict: + return outputs + transformer_outputs[1:] + + return MaskedLMOutput( + loss=outputs[0] if labels is not None else None, + logits=outputs[0] if labels is None else outputs[1], + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + e.g. for GLUE tasks. + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForSequenceClassification(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.transformer = FlaubertModel(config) + self.sequence_summary = FlaubertSequenceSummary(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + logits = self.sequence_summary(output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForTokenClassification(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = FlaubertModel(config) + self.dropout = nn.Dropout(config.dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = FlaubertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models using a `SquadHead`. + """ +) +# Copied from transformer.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert +class FlaubertForQuestionAnsweringOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. + start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + """ + + loss: Optional[torch.FloatTensor] = None + start_top_log_probs: Optional[torch.FloatTensor] = None + start_top_index: Optional[torch.LongTensor] = None + end_top_log_probs: Optional[torch.FloatTensor] = None + end_top_index: Optional[torch.LongTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnswering with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForQuestionAnswering(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = FlaubertModel(config) + self.qa_outputs = FlaubertSQuADHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + is_impossible: Optional[torch.Tensor] = None, + cls_index: Optional[torch.Tensor] = None, + p_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels whether a question has an answer or no answer (SQuAD 2.0) + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the classification token to use as input for computing plausibility of the + answer. + p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be + masked. 0.0 mean token is not masked. + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaubertForQuestionAnswering + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048") + >>> model = FlaubertForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048") + + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze( + ... 0 + ... ) # Batch size 1 + >>> start_positions = torch.tensor([1]) + >>> end_positions = torch.tensor([3]) + + >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + + outputs = self.qa_outputs( + output, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + is_impossible=is_impossible, + p_mask=p_mask, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + transformer_outputs[1:] + + return FlaubertForQuestionAnsweringOutput( + loss=outputs.loss, + start_top_log_probs=outputs.start_top_log_probs, + start_top_index=outputs.start_top_index, + end_top_log_probs=outputs.end_top_log_probs, + end_top_index=outputs.end_top_index, + cls_logits=outputs.cls_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForMultipleChoice(FlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = FlaubertModel(config) + self.sequence_summary = FlaubertSequenceSummary(config) + self.logits_proj = nn.Linear(config.num_labels, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + langs = langs.view(-1, langs.size(-1)) if langs is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + logger.warning( + "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the " + "attention mask instead." + ) + lengths = None + + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "FlaubertForMultipleChoice", + "FlaubertForQuestionAnswering", + "FlaubertForQuestionAnsweringSimple", + "FlaubertForSequenceClassification", + "FlaubertForTokenClassification", + "FlaubertModel", + "FlaubertWithLMHeadModel", + "FlaubertPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..88b7ae9f0c9ddd96e761b3739a035c55a217dec8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py @@ -0,0 +1,1343 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TF 2.0 Flaubert model. +""" + +from __future__ import annotations + +import itertools +import random +import warnings +from dataclasses import dataclass + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFSharedEmbeddings, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_flaubert import FlaubertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased" +_CONFIG_FOR_DOC = "FlaubertConfig" + + +FLAUBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +FLAUBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - `1` for tokens that are **not masked**, + - `0` for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - `0` corresponds to a *sentence A* token, + - `1` corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, tf.Tensor]`, *optional*): + Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. + + The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - `1` indicates the head is **not masked**, + - `0` indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +def get_masks(slen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. + """ + bs = shape_list(lengths)[0] + if padding_mask is not None: + mask = padding_mask + else: + # assert lengths.max().item() <= slen + alen = tf.range(slen, dtype=lengths.dtype) + mask = alen < tf.expand_dims(lengths, axis=1) + + # attention mask is the same as mask, or triangular inferior attention (causal) + if causal: + attn_mask = tf.less_equal( + tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1)) + ) + else: + attn_mask = mask + + # sanity check + # assert shape_list(mask) == [bs, slen] + tf.debugging.assert_equal(shape_list(mask), [bs, slen]) + if causal: + tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen]) + + return mask, attn_mask + + +class TFFlaubertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = FlaubertConfig + base_model_prefix = "transformer" + + @property + def dummy_inputs(self): + # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed + inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32) + attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32) + if self.config.use_lang_emb and self.config.n_langs > 1: + return { + "input_ids": inputs_list, + "attention_mask": attns_list, + "langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32), + } + else: + return {"input_ids": inputs_list, "attention_mask": attns_list} + + +@add_start_docstrings( + "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertModel(TFFlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutput: + outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert +class TFFlaubertMultiHeadAttention(keras.layers.Layer): + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config, **kwargs): + super().__init__(**kwargs) + self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID) + self.dim = dim + self.n_heads = n_heads + self.output_attentions = config.output_attentions + assert self.dim % self.n_heads == 0 + + self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") + self.dropout = keras.layers.Dropout(config.attention_dropout) + self.pruned_heads = set() + self.dim = dim + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). + """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = shape_list(input) + + if kv is None: + klen = qlen if cache is None else cache["slen"] + qlen + else: + klen = shape_list(kv)[1] + + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + dim_per_head = self.dim // self.n_heads + mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) + + def shape(x): + """projection""" + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """compute context""" + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + + cache[self.layer_id] = (k, v) + + f_dim_per_head = tf.cast(dim_per_head, dtype=q.dtype) + q = tf.multiply(q, tf.math.rsqrt(f_dim_per_head)) # (bs, n_heads, qlen, dim_per_head) + k = tf.cast(k, dtype=q.dtype) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + mask = tf.cast(mask, dtype=scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + outputs = (self.out_lin(context),) + + if output_attentions: + outputs = outputs + (weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_lin", None) is not None: + with tf.name_scope(self.q_lin.name): + self.q_lin.build([None, None, self.dim]) + if getattr(self, "k_lin", None) is not None: + with tf.name_scope(self.k_lin.name): + self.k_lin.build([None, None, self.dim]) + if getattr(self, "v_lin", None) is not None: + with tf.name_scope(self.v_lin.name): + self.v_lin.build([None, None, self.dim]) + if getattr(self, "out_lin", None) is not None: + with tf.name_scope(self.out_lin.name): + self.out_lin.build([None, None, self.dim]) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN +class TFFlaubertTransformerFFN(keras.layers.Layer): + def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): + super().__init__(**kwargs) + + self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") + self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu") + self.dropout = keras.layers.Dropout(config.dropout) + self.in_dim = in_dim + self.dim_hidden = dim_hidden + + def call(self, input, training=False): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build([None, None, self.in_dim]) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build([None, None, self.dim_hidden]) + + +@keras_serializable +class TFFlaubertMainLayer(keras.layers.Layer): + config_class = FlaubertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.n_heads = config.n_heads + self.n_langs = config.n_langs + self.dim = config.emb_dim + self.hidden_dim = self.dim * 4 + self.n_words = config.n_words + self.pad_index = config.pad_index + self.causal = config.causal + self.n_layers = config.n_layers + self.use_lang_emb = config.use_lang_emb + self.layerdrop = getattr(config, "layerdrop", 0.0) + self.pre_norm = getattr(config, "pre_norm", False) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.max_position_embeddings = config.max_position_embeddings + self.embed_init_std = config.embed_init_std + self.dropout = keras.layers.Dropout(config.dropout) + self.embeddings = TFSharedEmbeddings( + self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" + ) + self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") + self.attentions = [] + self.layer_norm1 = [] + self.ffns = [] + self.layer_norm2 = [] + + for i in range(self.n_layers): + self.attentions.append( + TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}") + ) + self.layer_norm1.append( + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") + ) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append( + TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}") + ) + self.layer_norm2.append( + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") + ) + + def build(self, input_shape=None): + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.dim], + initializer=get_initializer(self.embed_init_std), + ) + + if self.n_langs > 1 and self.use_lang_emb: + with tf.name_scope("lang_embeddings"): + self.lang_embeddings = self.add_weight( + name="embeddings", + shape=[self.n_langs, self.dim], + initializer=get_initializer(self.embed_init_std), + ) + + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "layer_norm_emb", None) is not None: + with tf.name_scope(self.layer_norm_emb.name): + self.layer_norm_emb.build([None, None, self.dim]) + for layer in self.attentions: + with tf.name_scope(layer.name): + layer.build(None) + for layer in self.layer_norm1: + with tf.name_scope(layer.name): + layer.build([None, None, self.dim]) + for layer in self.ffns: + with tf.name_scope(layer.name): + layer.build(None) + for layer in self.layer_norm2: + with tf.name_scope(layer.name): + layer.build([None, None, self.dim]) + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + @unpack_inputs + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutput: + # removed: src_enc=None, src_len=None + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + bs, slen = shape_list(input_ids) + elif inputs_embeds is not None: + bs, slen = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if lengths is None: + if input_ids is not None: + lengths = tf.reduce_sum( + tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=input_ids.dtype), axis=1 + ) + else: + lengths = tf.convert_to_tensor([slen] * bs) + # mask = input_ids != self.pad_index + + # check inputs + # assert shape_list(lengths)[0] == bs + ( + tf.debugging.assert_equal(shape_list(lengths)[0], bs), + f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched", + ) + # assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # position_ids + if position_ids is None: + position_ids = tf.expand_dims(tf.range(slen), axis=0) + position_ids = tf.tile(position_ids, (bs, 1)) + + # assert shape_list(position_ids) == [bs, slen] # (slen, bs) + ( + tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]), + f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched", + ) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + # assert shape_list(langs) == [bs, slen] # (slen, bs) + ( + tf.debugging.assert_equal(shape_list(langs), [bs, slen]), + f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched", + ) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layers + + # do not recompute cached elements + if cache is not None and input_ids is not None: + _slen = slen - cache["slen"] + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embeddings.vocab_size) + inputs_embeds = self.embeddings(input_ids) + + tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids) + + if langs is not None and self.use_lang_emb: + tensor = tensor + tf.gather(self.lang_embeddings, langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + + tensor = self.layer_norm_emb(tensor) + tensor = self.dropout(tensor, training=training) + mask = tf.cast(mask, dtype=tensor.dtype) + tensor = tensor * tf.expand_dims(mask, axis=-1) + + # hidden_states and attentions cannot be None in graph mode. + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + + # transformer layers + for i in range(self.n_layers): + # LayerDrop + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # self attention + if not self.pre_norm: + attn_outputs = self.attentions[i]( + tensor, + attn_mask, + None, + cache, + head_mask[i], + output_attentions, + training=training, + ) + attn = attn_outputs[0] + + if output_attentions: + attentions = attentions + (attn_outputs[1],) + + attn = self.dropout(attn, training=training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + else: + tensor_normalized = self.layer_norm1[i](tensor) + attn_outputs = self.attentions[i]( + tensor_normalized, + attn_mask, + None, + cache, + head_mask[i], + output_attentions, + training=training, + ) + attn = attn_outputs[0] + + if output_attentions: + attentions = attentions + (attn_outputs[1],) + + attn = self.dropout(attn, training=training) + tensor = tensor + attn + + # encoder attention (for decoder only) + # if self.is_decoder and src_enc is not None: + # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + # tensor = tensor + attn + # tensor = self.layer_norm15[i](tensor) + + # FFN + if not self.pre_norm: + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + else: + tensor_normalized = self.layer_norm2[i](tensor) + tensor = tensor + self.ffns[i](tensor_normalized) + + tensor = tensor * tf.expand_dims(mask, axis=-1) + + # Add last hidden state + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache["slen"] += tensor.size(1) + + # move back sequence length to dimension 0 + # tensor = tensor.transpose(0, 1) + + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + + return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer +class TFFlaubertPredLayer(keras.layers.Layer): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + + if config.asm is False: + self.input_embeddings = input_embeddings + else: + raise NotImplementedError + # self.proj = nn.AdaptiveLogSoftmaxWithLoss( + # in_features=dim, + # n_classes=config.n_words, + # cutoffs=config.asm_cutoffs, + # div_value=config.asm_div_value, + # head_bias=True, # default is False + # ) + + def build(self, input_shape): + # The output weights are the same as the input embeddings, but there is an output-only bias for each token. + self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + + return hidden_states + + +@dataclass +class TFFlaubertWithLMHeadModelOutput(ModelOutput): + """ + Base class for [`TFFlaubertWithLMHeadModel`] outputs. + + Args: + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +@add_start_docstrings( + """ + The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") + # Flaubert does not have past caching features + self.supports_xla_generation = False + + def get_lm_head(self): + return self.pred_layer + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.pred_layer.name + + def prepare_inputs_for_generation(self, inputs, **kwargs): + mask_token_id = self.config.mask_token_id + lang_id = self.config.lang_id + + effective_batch_size = inputs.shape[0] + mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id + inputs = tf.concat([inputs, mask_token], axis=1) + + if lang_id is not None: + langs = tf.ones_like(inputs) * lang_id + else: + langs = None + return {"input_ids": inputs, "langs": langs} + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFFlaubertWithLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFFlaubertWithLMHeadModelOutput: + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + outputs = self.pred_layer(output) + + if not return_dict: + return (outputs,) + transformer_outputs[1:] + + return TFFlaubertWithLMHeadModelOutput( + logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "pred_layer", None) is not None: + with tf.name_scope(self.pred_layer.name): + self.pred_layer.build(None) + + +@add_start_docstrings( + """ + Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + e.g. for GLUE tasks. + """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + + logits = self.sequence_summary(output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + + +@add_start_docstrings( + """ + Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = transformer_outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + self.logits_proj = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) + self.config = config + + @property + def dummy_inputs(self): + """ + Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed + if self.config.use_lang_emb and self.config.n_langs > 1: + return { + "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + } + else: + return { + "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + } + + @unpack_inputs + @add_start_docstrings_to_model_forward( + FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + logger.warning( + "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the " + "attention mask instead.", + ) + lengths = None + + transformer_outputs = self.transformer( + flat_input_ids, + flat_attention_mask, + flat_langs, + flat_token_type_ids, + flat_position_ids, + lengths, + cache, + head_mask, + flat_inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "logits_proj", None) is not None: + with tf.name_scope(self.logits_proj.name): + self.logits_proj.build([None, None, self.config.num_labels]) + + +__all__ = [ + "TFFlaubertForMultipleChoice", + "TFFlaubertForQuestionAnsweringSimple", + "TFFlaubertForSequenceClassification", + "TFFlaubertForTokenClassification", + "TFFlaubertModel", + "TFFlaubertPreTrainedModel", + "TFFlaubertWithLMHeadModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..dee653450ebacc02bcfee8142c5d08b54358c1a9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Flaubert.""" + +import json +import os +import re +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +def convert_to_unicode(text): + """ + Converts `text` to Unicode (if it's not already), assuming UTF-8 input. + """ + + def ensure_text(s, encoding="utf-8", errors="strict"): + if isinstance(s, bytes): + return s.decode(encoding, errors) + elif isinstance(s, str): + return s + else: + raise TypeError(f"not expecting type '{type(s)}'") + + return ensure_text(text, encoding="utf-8", errors="ignore") + + +# Copied from transformers.models.xlm.tokenization_xlm.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length + strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct +def replace_unicode_punct(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", "1") + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". ", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") + return text + + +# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char +def remove_non_printing_char(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + """ + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith("C"): + continue + output.append(char) + return "".join(output) + + +class FlaubertTokenizer(PreTrainedTokenizer): + """ + Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following: + + - Moses preprocessing and tokenization. + - Normalizing all inputs text. + - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like + "__classify__") to a vocabulary. + - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Vocabulary file. + merges_file (`str`): + Merges file. + do_lowercase (`bool`, *optional*, defaults to `False`): + Controls lower casing. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `['', '', '', '', '', '', '', '', '', '']`): + List of additional special tokens. + lang2id (`Dict[str, int]`, *optional*): + Dictionary mapping languages string identifiers to their IDs. + id2lang (`Dict[int, str]`, *optional*): + Dictionary mapping language IDs to their string identifiers. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + merges_file, + do_lowercase=False, + unk_token="", + bos_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=[ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + lang2id=None, + id2lang=None, + **kwargs, + ): + do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None) + if do_lowercase_and_remove_accent is not None: + logger.warning( + "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything." + " `FlaubertTokenizer` will always set it to `False`." + ) + # always `False` + self.do_lowercase_and_remove_accent = False + + self.do_lowercase = do_lowercase + + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use FlaubertTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.sm = sacremoses + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = {} + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = {} + self.lang_with_custom_tokenizer = {"zh", "th", "ja"} + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + self.zh_word_tokenizer = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + super().__init__( + do_lowercase=do_lowercase, + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + **kwargs, + ) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case + def do_lower_case(self): + return self.do_lowercase_and_remove_accent + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm + def moses_punct_norm(self, text, lang): + if lang not in self.cache_moses_punct_normalizer: + punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang) + self.cache_moses_punct_normalizer[lang] = punct_normalizer + else: + punct_normalizer = self.cache_moses_punct_normalizer[lang] + return punct_normalizer.normalize(text) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = self.sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + else: + moses_tokenizer = self.cache_moses_tokenizer[lang] + return moses_tokenizer.tokenize(text, return_str=False, escape=False) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline + def moses_pipeline(self, text, lang): + text = replace_unicode_punct(text) + text = self.moses_punct_norm(text, lang) + text = remove_non_printing_char(text) + return text + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize + def ja_tokenize(self, text): + if self.ja_word_tokenizer is None: + try: + import Mykytea + + self.ja_word_tokenizer = Mykytea.Mykytea( + f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin" + ) + except (AttributeError, ImportError): + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper" + " (https://github.com/chezou/Mykytea-python) with the following steps" + ) + logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") + logger.error("2. autoreconf -i") + logger.error("3. ./configure --prefix=$HOME/local") + logger.error("4. make && make install") + logger.error("5. pip install kytea") + raise + return list(self.ja_word_tokenizer.getWS(text)) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size + def vocab_size(self): + return len(self.encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + "",) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n ": + word = "\n" + self.cache[token] = word + return word + + def preprocess_text(self, text): + text = text.replace("``", '"').replace("''", '"') + text = convert_to_unicode(text) + text = unicodedata.normalize("NFC", text) + + if self.do_lowercase: + text = text.lower() + + return text + + def _tokenize(self, text, bypass_tokenizer=False): + """ + Tokenize a string given language code using Moses. + + Details of tokenization: + + - [sacremoses](https://github.com/alvations/sacremoses): port of Moses + - Install with `pip install sacremoses` + + Args: + - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) + (bool). If True, we only apply BPE. + + Returns: + List of tokens. + """ + lang = "fr" + if lang and self.lang2id and lang not in self.lang2id: + logger.error( + "Supplied language code not found in lang2id mapping. Please check that your language is supported by" + " the loaded pretrained model." + ) + + if bypass_tokenizer: + text = text.split() + else: + text = self.preprocess_text(text) + text = self.moses_pipeline(text, lang=lang) + text = self.moses_tokenize(text, lang=lang) + + split_tokens = [] + for token in text: + if token: + split_tokens.extend(list(self.bpe(token).split(" "))) + + return split_tokens + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = "".join(tokens).replace("", " ").strip() + return out_string + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + + """ + bos = [self.bos_token_id] + sep = [self.sep_token_id] + + if token_ids_1 is None: + return bos + token_ids_0 + sep + return bos + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__ + def __getstate__(self): + state = self.__dict__.copy() + state["sm"] = None + return state + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use XLMTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.sm = sacremoses + + +__all__ = ["FlaubertTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..292593cb4a201e35a9fd571baec639d9b940e76c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_flava import * + from .feature_extraction_flava import * + from .image_processing_flava import * + from .image_processing_flava_fast import * + from .modeling_flava import * + from .processing_flava import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bcb920e47acfacadd7a066e773ca2a3d42e2dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py @@ -0,0 +1,697 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FLAVA model configurations""" + +from typing import Any, Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FlavaImageConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FlavaImageModel`]. It is used to instantiate an + FLAVA model according to the specified arguments, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA + [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + mask_token (`bool`, *optional*, defaults to `True`): + Whether to use a mask token or not. Used in MIM (Masked Image Modeling) loss for FLAVA. + vocab_size (`int`, *optional*, defaults to 8192): + Vocabulary size of the [`FlavaImageCodebook`] used in conjunction with [`FlavaImageModel`] for MIM (Masked + Image Modeling) loss for FLAVA. + + Example: + + ```python + >>> from transformers import FlavaImageConfig, FlavaImageModel + + >>> # Initializing a FlavaImageModel with style configuration + >>> configuration = FlavaImageConfig() + + >>> # Initializing a FlavaImageModel model (with random weights) from the style configuration + >>> model = FlavaImageModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "flava_image_model" + base_config_key = "image_config" + + def __init__( + self, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: int = "gelu", + hidden_dropout_prob: float = 0.0, + attention_probs_dropout_prob: float = 0.0, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + image_size: int = 224, + patch_size: int = 16, + num_channels: int = 3, + qkv_bias: bool = True, + mask_token: bool = True, + vocab_size: int = 8192, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.mask_token = mask_token + self.vocab_size = vocab_size + + +class FlavaTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FlavaTextModel`]. It is used to instantiate an + FLAVA model according to the specified arguments, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA + [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FlavaTextModel`]. + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`FlavaTextModel`]. Note that even though + text encoder allows `token_type_ids`'s value as 2, for text-only pretraining and fine-tuning, only 1 is + used similar to RoBERTa. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + + Example: + + ```python + >>> from transformers import FlavaTextConfig, FlavaTextModel + + >>> # Initializing a FlavaTextModel with style configuration + >>> configuration = FlavaTextConfig() + + >>> # Initializing a FlavaTextModel model (with random weights) from the style configuration + >>> model = FlavaTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "flava_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size: int = 30522, + type_vocab_size: int = 2, + max_position_embeddings: int = 512, + position_embedding_type: str = "absolute", + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.0, + attention_probs_dropout_prob: float = 0.0, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + pad_token_id: int = 0, + qkv_bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.type_vocab_size = type_vocab_size + self.max_position_embeddings = max_position_embeddings + self.position_embedding_type = position_embedding_type + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.qkv_bias = qkv_bias + self.pad_token_id = pad_token_id + + +class FlavaMultimodalConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FlavaMultimodalModel`]. It is used to instantiate + an FLAVA model according to the specified arguments, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA + [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + use_cls_token (`bool`, *optional*, defaults to `True`): + Whether to use an extra CLS token for multimodal settings. Usually needed by the FLAVA model. + + + Example: + + ```python + >>> from transformers import FlavaMultimodalConfig, FlavaMultimodalModel + + >>> # Initializing a FlavaMultimodalModel with style configuration + >>> configuration = FlavaMultimodalConfig() + + >>> # Initializing a FlavaMultimodalModel model (with random weights) from the style configuration + >>> model = FlavaMultimodalModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "flava_multimodal_model" + base_config_key = "multimodal_config" + + def __init__( + self, + hidden_size: int = 768, + num_hidden_layers: int = 6, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: int = "gelu", + hidden_dropout_prob: int = 0.0, + attention_probs_dropout_prob: int = 0.0, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + qkv_bias: bool = True, + use_cls_token: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.qkv_bias = qkv_bias + self.use_cls_token = use_cls_token + + +class FlavaImageCodebookConfig(PretrainedConfig): + model_type = "flava_image_codebook" + base_config_key = "image_codebook_config" + + r""" + [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It + is used to instantiate an FLAVA model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA + [facebook/flava-image-codebook](https://huggingface.co/facebook/flava-image-codebook) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_groups (`int`, *optional*, defaults to 4): + Number of groups to be created. This parameter as of now doesn't affect the model and is used for some + internal calculation and estimations. + input_channels (`int`, *optional*, defaults to 3): + Number of channels in the image to be passed. + num_blocks_per_group (`int`, *optional*, defaults to 2): + Number of conv-based blocks per group. + hidden_size (`int`, *optional*, defaults to 256): + Size of hidden dim for the blocks. + vocab_size (`int`, *optional*, defaults to 8192): + Size of the output vocabulary for the codebook. + freeze (`bool`, defaults to `True`): + Whether to freeze the weights of the model. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import FlavaImageCodebookConfig, FlavaImageCodebook + + >>> # Initializing a FlavaImageCodebook with style configuration + >>> configuration = FlavaImageCodebookConfig() + + >>> # Initializing a FlavaImageCodebook model (with random weights) from the style configuration + >>> model = FlavaImageCodebook(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + def __init__( + self, + num_groups: int = 4, + input_channels: int = 3, + num_blocks_per_group: int = 2, + hidden_size: int = 256, + vocab_size: int = 8192, + freeze: int = True, + initializer_range: float = 0.02, + **kwargs, + ): + super().__init__(**kwargs) + self.num_groups = num_groups + self.input_channels = input_channels + self.num_blocks_per_group = num_blocks_per_group + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.freeze = freeze + self.initializer_range = initializer_range + + +class FlavaConfig(PretrainedConfig): + r""" + [`FlavaConfig`] is the configuration class to store the configuration of a [`FlavaModel`]. It is used to + instantiate FLAVA model according to the specified arguments, defining the text model, image model, image codebook + and multimodal model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the FLAVA [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`FlavaTextConfig`]. + image_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`FlavaImageConfig`]. + multimodal_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and image projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP + implementation. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + ce_ignore_index (`int`, *optional*, defaults to -100): + Cross entropy index to ignore. + mim_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to MIM (Masked Image Modeling) unimodal loss + mlm_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to MLM (Masked Language Modeling) unimodal loss + global_contrastive_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to global contrastive cross-alignment loss. + itm_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to image-text matching multimodal loss. + mmm_image_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to MMM loss's image part. + mmm_text_weight (`float`, *optional*, defaults to 1.0): + Weight to be assigned to MMM loss's text part. + global_backprop_contrastive (`bool`, *optional*, defaults to `True`): + Whether to use global backpropgation through all workers in contrastive loss. + skip_unmasked_multimodal_encoder (`bool`, *optional*, defaults to `True`): + Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses. + return_loss (`bool`, *optional*, defaults to `True`): + Whether to return loss or not + + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining + + >>> # Initializing a FlavaConfig with style configuration + >>> configuration = FlavaConfig() + + >>> # Initializing a FlavaModel and FlavaForPreTraining model (with random weights) from the style configuration + >>> model = FlavaModel(configuration) + >>> model_pre = FlavaForPreTraining(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + >>> configuration_pre = model_pre.config + ``` + """ + + model_type = "flava" + sub_configs = { + "text_config": FlavaTextConfig, + "image_config": FlavaImageConfig, + "multimodal_config": FlavaMultimodalConfig, + "image_codebook_config": FlavaImageCodebookConfig, + } + + def __init__( + self, + image_config: Optional[dict[str, Any]] = None, + text_config: Optional[dict[str, Any]] = None, + multimodal_config: Optional[dict[str, Any]] = None, + image_codebook_config: Optional[dict[str, Any]] = None, + hidden_size: int = 768, + layer_norm_eps: float = 1e-12, + projection_dim: int = 768, + init_codebook: bool = True, + logit_scale_init_value: float = 2.6592, + initializer_range: float = 0.02, + ce_ignore_index: int = -100, + mim_weight: float = 1.0, + mlm_weight: float = 1.0, + global_contrastive_weight: float = 1.0, + itm_weight: float = 1.0, + mmm_image_weight: float = 1.0, + mmm_text_weight: float = 1.0, + global_backprop_contrastive: bool = True, + skip_unmasked_multimodal_encoder: bool = True, + return_loss: bool = True, + **kwargs, + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + image_config_dict = kwargs.pop("image_config_dict", None) + multimodal_config_dict = kwargs.pop("multimodal_config_dict", None) + image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = FlavaTextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key != "transformers_version": + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if image_config_dict is not None: + if image_config is None: + image_config = {} + + # This is the complete result when using `image_config_dict`. + _image_config_dict = FlavaImageConfig(**image_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _image_config_dict: + _image_config_dict["id2label"] = { + str(key): value for key, value in _image_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_image_config_dict` and `image_config` but being different. + for key, value in _image_config_dict.items(): + if key in image_config and value != image_config[key] and key != "transformers_version": + # If specified in `image_config_dict` + if key in image_config_dict: + message = ( + f"`{key}` is found in both `image_config_dict` and `image_config` but with different " + f'values. The value `image_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. " + f'The value `image_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `image_config` with the ones in `_image_config_dict`. + image_config.update(_image_config_dict) + + if multimodal_config_dict is not None: + if multimodal_config is None: + multimodal_config = {} + + # This is the complete result when using `multimodal_config_dict`. + _multimodal_config_dict = FlavaMultimodalConfig(**multimodal_config_dict).to_dict() + + # Give a warning if the values exist in both `_multimodal_config_dict` and `multimodal_config` but being + # different. + for key, value in _multimodal_config_dict.items(): + if key in multimodal_config and value != multimodal_config[key] and key != "transformers_version": + # If specified in `multimodal_config_dict` + if key in multimodal_config_dict: + message = ( + f"`{key}` is found in both `multimodal_config_dict` and `multimodal_config` but with " + f'different values. The value `multimodal_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`multimodal_config_dict` is provided which will be used to initialize " + f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `multimodal_config` with the ones in `_multimodal_config_dict`. + multimodal_config.update(_multimodal_config_dict) + + if image_codebook_config_dict is not None: + if image_codebook_config is None: + image_codebook_config = {} + + # This is the complete result when using `image_codebook_config_dict`. + _image_codebook_config_dict = FlavaImageCodebookConfig(**image_codebook_config_dict).to_dict() + + # Give a warning if the values exist in both `_image_codebook_config_dict` and `image_codebook_config` but + # being different. + for key, value in _image_codebook_config_dict.items(): + if ( + key in image_codebook_config + and value != image_codebook_config[key] + and key != "transformers_version" + ): + # If specified in `image_codebook_config_dict` + if key in image_codebook_config_dict: + message = ( + f"`{key}` is found in both `image_codebook_config_dict` and `image_codebook_config` but " + f'with different values. The value `image_codebook_config_dict["{key}"]` will be used ' + "instead." + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`image_codebook_config_dict` is provided which will be used to initialize " + f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`. + image_codebook_config.update(_image_codebook_config_dict) + + if image_config is None: + image_config = {} + logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.") + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.") + + if multimodal_config is None: + multimodal_config = {} + logger.info("`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.") + + if image_codebook_config is None: + image_codebook_config = {} + logger.info( + "`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values." + ) + + self.image_config = FlavaImageConfig(**image_config) + self.text_config = FlavaTextConfig(**text_config) + self.multimodal_config = FlavaMultimodalConfig(**multimodal_config) + self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config) + self.projection_dim = projection_dim + self.init_codebook = init_codebook + + self.hidden_size = hidden_size + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.ce_ignore_index = ce_ignore_index + self.mim_weight = mim_weight + self.mlm_weight = mlm_weight + self.global_contrastive_weight = global_contrastive_weight + self.itm_weight = itm_weight + self.mmm_image_weight = mmm_image_weight + self.mmm_text_weight = mmm_text_weight + self.global_backprop_contrastive = global_backprop_contrastive + self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder + self.return_loss = return_loss + + @classmethod + def from_configs( + cls, + image_config: FlavaImageConfig, + text_config: FlavaTextConfig, + multimodal_config: FlavaMultimodalConfig, + image_codebook_config: FlavaImageCodebookConfig, + **kwargs, + ): + r""" + Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model + configuration, flava multimodal model and flava codebook model configuration. + + Returns: + [`FlavaConfig`]: An instance of a configuration object + """ + + return cls( + image_config=image_config.to_dict(), + text_config=text_config.to_dict(), + multimodal_config=multimodal_config.to_dict(), + image_codebook_config=image_codebook_config.to_dict(), + **kwargs, + ) + + +__all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..19bcccc889f546442af71229c998880fbbb2db31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for FLAVA.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_flava import FlavaImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class FlavaFeatureExtractor(FlavaImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class FlavaFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use FlavaImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["FlavaFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..7b4db246a8fa4195efd8d9840e351aae43af2a53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py @@ -0,0 +1,706 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Flava.""" + +import math +import random +from collections.abc import Iterable +from functools import lru_cache +from typing import Any, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...utils.import_utils import requires + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +# These values are taken from CLIP +FLAVA_IMAGE_MEAN = OPENAI_CLIP_MEAN +FLAVA_IMAGE_STD = OPENAI_CLIP_STD +FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0] +FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0] +LOGIT_LAPLACE_EPS: float = 0.1 + + +# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py +class FlavaMaskingGenerator: + def __init__( + self, + input_size: Union[int, tuple[int, int]] = 14, + total_mask_patches: int = 75, + mask_group_max_patches: Optional[int] = None, + mask_group_min_patches: int = 16, + mask_group_min_aspect_ratio: Optional[float] = 0.3, + mask_group_max_aspect_ratio: Optional[float] = None, + ): + if not isinstance(input_size, tuple): + input_size = (input_size,) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.total_mask_patches = total_mask_patches + + self.mask_group_min_patches = mask_group_min_patches + self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches + + mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio + self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio)) + + def __repr__(self): + repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, + self.width, + self.mask_group_min_patches, + self.mask_group_max_patches, + self.total_mask_patches, + self.log_aspect_ratio[0], + self.log_aspect_ratio[1], + ) + return repr_str + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for _attempt in range(10): + target_area = random.uniform(self.mask_group_min_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + height = int(round(math.sqrt(target_area * aspect_ratio))) + width = int(round(math.sqrt(target_area / aspect_ratio))) + if width < self.width and height < self.height: + top = random.randint(0, self.height - height) + left = random.randint(0, self.width - width) + + num_masked = mask[top : top + height, left : left + width].sum() + # Overlap + if 0 < height * width - num_masked <= max_mask_patches: + for i in range(top, top + height): + for j in range(left, left + width): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self): + mask = np.zeros(shape=self.get_shape(), dtype=int) + mask_count = 0 + while mask_count < self.total_mask_patches: + max_mask_patches = self.total_mask_patches - mask_count + max_mask_patches = min(max_mask_patches, self.mask_group_max_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask + + +@requires(backends=("vision",)) +class FlavaImageProcessor(BaseImageProcessor): + r""" + Constructs a Flava image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in `preprocess`. + size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the image after resizing. Can be overridden by the `size` parameter in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in + `preprocess`. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the images. Can be overridden by the `do_center_crop` parameter in `preprocess`. + crop_size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): + Size of image after the center crop `(crop_size["height"], crop_size["width"])`. Can be overridden by the + `crop_size` parameter in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in `preprocess`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in + `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in `preprocess`. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + return_image_mask (`bool`, *optional*, defaults to `False`): + Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. + input_size_patches (`int`, *optional*, defaults to 14): + Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden + by the `input_size_patches` parameter in `preprocess`. + total_mask_patches (`int`, *optional*, defaults to 75): + Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in + `preprocess`. + mask_group_min_patches (`int`, *optional*, defaults to 16): + Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` + parameter in `preprocess`. + mask_group_max_patches (`int`, *optional*): + Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` + parameter in `preprocess`. + mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): + Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter + in `preprocess`. + mask_group_max_aspect_ratio (`float`, *optional*): + Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter + in `preprocess`. + codebook_do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize` + parameter in `preprocess`. `codebook_size`. + codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in + `preprocess`. + codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`): + Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample` + parameter in `preprocess`. + codebook_do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to crop the input for codebook at the center. If the input size is smaller than + `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be + overridden by the `codebook_do_center_crop` parameter in `preprocess`. + codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Desired output size for codebook input when applying center-cropping. Can be overridden by the + `codebook_crop_size` parameter in `preprocess`. + codebook_do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be + overridden by the `codebook_do_rescale` parameter in `preprocess`. + codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Defines the scale factor to use if rescaling the codebook image. Can be overridden by the + `codebook_rescale_factor` parameter in `preprocess`. + codebook_do_map_pixels (`bool`, *optional*, defaults to `True`): + Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the + `codebook_do_map_pixels` parameter in `preprocess`. + codebook_do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can + be overridden by the `codebook_do_normalize` parameter in `preprocess`. + codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`): + The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden + by the `codebook_image_mean` parameter in `preprocess`. + codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can + be overridden by the `codebook_image_std` parameter in `preprocess`. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, Iterable[float]]] = None, + image_std: Optional[Union[float, Iterable[float]]] = None, + # Mask related params + return_image_mask: bool = False, + input_size_patches: int = 14, + total_mask_patches: int = 75, + mask_group_min_patches: int = 16, + mask_group_max_patches: Optional[int] = None, + mask_group_min_aspect_ratio: float = 0.3, + mask_group_max_aspect_ratio: Optional[float] = None, + # Codebook related params + return_codebook_pixels: bool = False, + codebook_do_resize: bool = True, + codebook_size: Optional[bool] = None, + codebook_resample: int = PILImageResampling.LANCZOS, + codebook_do_center_crop: bool = True, + codebook_crop_size: Optional[int] = None, + codebook_do_rescale: bool = True, + codebook_rescale_factor: Union[int, float] = 1 / 255, + codebook_do_map_pixels: bool = True, + codebook_do_normalize: bool = True, + codebook_image_mean: Optional[Union[float, Iterable[float]]] = None, + codebook_image_std: Optional[Union[float, Iterable[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, param_name="crop_size") + + codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112} + codebook_size = get_size_dict(codebook_size, param_name="codebook_size") + codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112} + codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else FLAVA_IMAGE_MEAN + self.image_std = image_std if image_std is not None else FLAVA_IMAGE_STD + + self.return_image_mask = return_image_mask + self.input_size_patches = input_size_patches + self.total_mask_patches = total_mask_patches + self.mask_group_min_patches = mask_group_min_patches + self.mask_group_max_patches = mask_group_max_patches + self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio + self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio + + self.return_codebook_pixels = return_codebook_pixels + self.codebook_do_resize = codebook_do_resize + self.codebook_size = codebook_size + self.codebook_resample = codebook_resample + self.codebook_do_center_crop = codebook_do_center_crop + self.codebook_crop_size = codebook_crop_size + self.codebook_do_rescale = codebook_do_rescale + self.codebook_rescale_factor = codebook_rescale_factor + self.codebook_do_map_pixels = codebook_do_map_pixels + self.codebook_do_normalize = codebook_do_normalize + self.codebook_image_mean = codebook_image_mean + self.codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else FLAVA_CODEBOOK_MEAN + self.codebook_image_std = codebook_image_std if codebook_image_std is not None else FLAVA_CODEBOOK_STD + + @classmethod + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)` + """ + image_processor_dict = image_processor_dict.copy() + if "codebook_size" in kwargs: + image_processor_dict["codebook_size"] = kwargs.pop("codebook_size") + if "codebook_crop_size" in kwargs: + image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size") + return super().from_dict(image_processor_dict, **kwargs) + + @lru_cache + def masking_generator( + self, + input_size_patches, + total_mask_patches, + mask_group_min_patches, + mask_group_max_patches, + mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio, + ) -> FlavaMaskingGenerator: + return FlavaMaskingGenerator( + input_size=input_size_patches, + total_mask_patches=total_mask_patches, + mask_group_min_patches=mask_group_min_patches, + mask_group_max_patches=mask_group_max_patches, + mask_group_min_aspect_ratio=mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio=mask_group_max_aspect_ratio, + ) + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def map_pixels(self, image: np.ndarray) -> np.ndarray: + return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS + + def _preprocess_image( + self, + image: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_map_pixels: Optional[bool] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[ChannelDimension] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + image = to_numpy_array(image) + + if do_rescale and is_scaled_image(image): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + if do_map_pixels: + image = self.map_pixels(image) + + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + # Mask related params + return_image_mask: Optional[bool] = None, + input_size_patches: Optional[int] = None, + total_mask_patches: Optional[int] = None, + mask_group_min_patches: Optional[int] = None, + mask_group_max_patches: Optional[int] = None, + mask_group_min_aspect_ratio: Optional[float] = None, + mask_group_max_aspect_ratio: Optional[float] = None, + # Codebook related params + return_codebook_pixels: Optional[bool] = None, + codebook_do_resize: Optional[bool] = None, + codebook_size: Optional[dict[str, int]] = None, + codebook_resample: Optional[int] = None, + codebook_do_center_crop: Optional[bool] = None, + codebook_crop_size: Optional[dict[str, int]] = None, + codebook_do_rescale: Optional[bool] = None, + codebook_rescale_factor: Optional[float] = None, + codebook_do_map_pixels: Optional[bool] = None, + codebook_do_normalize: Optional[bool] = None, + codebook_image_mean: Optional[Iterable[float]] = None, + codebook_image_std: Optional[Iterable[float]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + return_image_mask (`bool`, *optional*, defaults to `self.return_image_mask`): + Whether to return the image mask. + input_size_patches (`int`, *optional*, defaults to `self.input_size_patches`): + Size of the patches to extract from the image. + total_mask_patches (`int`, *optional*, defaults to `self.total_mask_patches`): + Total number of patches to extract from the image. + mask_group_min_patches (`int`, *optional*, defaults to `self.mask_group_min_patches`): + Minimum number of patches to extract from the image. + mask_group_max_patches (`int`, *optional*, defaults to `self.mask_group_max_patches`): + Maximum number of patches to extract from the image. + mask_group_min_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_min_aspect_ratio`): + Minimum aspect ratio of the patches to extract from the image. + mask_group_max_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_max_aspect_ratio`): + Maximum aspect ratio of the patches to extract from the image. + return_codebook_pixels (`bool`, *optional*, defaults to `self.return_codebook_pixels`): + Whether to return the codebook pixels. + codebook_do_resize (`bool`, *optional*, defaults to `self.codebook_do_resize`): + Whether to resize the codebook pixels. + codebook_size (`dict[str, int]`, *optional*, defaults to `self.codebook_size`): + Size of the codebook pixels. + codebook_resample (`int`, *optional*, defaults to `self.codebook_resample`): + Resampling filter to use if resizing the codebook pixels. This can be one of the enum + `PILImageResampling`, Only has an effect if `codebook_do_resize` is set to `True`. + codebook_do_center_crop (`bool`, *optional*, defaults to `self.codebook_do_center_crop`): + Whether to center crop the codebook pixels. + codebook_crop_size (`dict[str, int]`, *optional*, defaults to `self.codebook_crop_size`): + Size of the center crop of the codebook pixels. Only has an effect if `codebook_do_center_crop` is set + to `True`. + codebook_do_rescale (`bool`, *optional*, defaults to `self.codebook_do_rescale`): + Whether to rescale the codebook pixels values between [0 - 1]. + codebook_rescale_factor (`float`, *optional*, defaults to `self.codebook_rescale_factor`): + Rescale factor to rescale the codebook pixels by if `codebook_do_rescale` is set to `True`. + codebook_do_map_pixels (`bool`, *optional*, defaults to `self.codebook_do_map_pixels`): + Whether to map the codebook pixels values. + codebook_do_normalize (`bool`, *optional*, defaults to `self.codebook_do_normalize`): + Whether to normalize the codebook pixels. + codebook_image_mean (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_mean`): + Codebook pixels mean to normalize the codebook pixels by if `codebook_do_normalize` is set to `True`. + codebook_image_std (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_std`): + Codebook pixels standard deviation to normalize the codebook pixels by if `codebook_do_normalize` is + set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size") + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + return_image_mask = return_image_mask if return_image_mask is not None else self.return_image_mask + input_size_patches = input_size_patches if input_size_patches is not None else self.input_size_patches + total_mask_patches = total_mask_patches if total_mask_patches is not None else self.total_mask_patches + mask_group_min_patches = ( + mask_group_min_patches if mask_group_min_patches is not None else self.mask_group_min_patches + ) + mask_group_max_patches = ( + mask_group_max_patches if mask_group_max_patches is not None else self.mask_group_max_patches + ) + mask_group_min_aspect_ratio = ( + mask_group_min_aspect_ratio + if mask_group_min_aspect_ratio is not None + else self.mask_group_min_aspect_ratio + ) + mask_group_max_aspect_ratio = ( + mask_group_max_aspect_ratio + if mask_group_max_aspect_ratio is not None + else self.mask_group_max_aspect_ratio + ) + + return_codebook_pixels = ( + return_codebook_pixels if return_codebook_pixels is not None else self.return_codebook_pixels + ) + codebook_do_resize = codebook_do_resize if codebook_do_resize is not None else self.codebook_do_resize + codebook_size = codebook_size if codebook_size is not None else self.codebook_size + codebook_size = get_size_dict(codebook_size, param_name="codebook_size") + codebook_resample = codebook_resample if codebook_resample is not None else self.codebook_resample + codebook_do_rescale = codebook_do_rescale if codebook_do_rescale is not None else self.codebook_do_rescale + codebook_rescale_factor = ( + codebook_rescale_factor if codebook_rescale_factor is not None else self.codebook_rescale_factor + ) + codebook_do_center_crop = ( + codebook_do_center_crop if codebook_do_center_crop is not None else self.codebook_do_center_crop + ) + codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else self.codebook_crop_size + codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size") + codebook_do_map_pixels = ( + codebook_do_map_pixels if codebook_do_map_pixels is not None else self.codebook_do_map_pixels + ) + codebook_do_normalize = ( + codebook_do_normalize if codebook_do_normalize is not None else self.codebook_do_normalize + ) + codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else self.codebook_image_mean + codebook_image_std = codebook_image_std if codebook_image_std is not None else self.codebook_image_std + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + processed_images = [ + self._preprocess_image( + image=img, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_map_pixels=False, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + data = {"pixel_values": processed_images} + + if return_codebook_pixels: + codebook_images = [ + self._preprocess_image( + image=img, + do_resize=codebook_do_resize, + size=codebook_size, + resample=codebook_resample, + do_center_crop=codebook_do_center_crop, + crop_size=codebook_crop_size, + do_rescale=codebook_do_rescale, + rescale_factor=codebook_rescale_factor, + do_normalize=codebook_do_normalize, + image_mean=codebook_image_mean, + image_std=codebook_image_std, + do_map_pixels=codebook_do_map_pixels, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + data["codebook_pixel_values"] = codebook_images + + if return_image_mask: + mask_generator = self.masking_generator( + input_size_patches=input_size_patches, + total_mask_patches=total_mask_patches, + mask_group_min_patches=mask_group_min_patches, + mask_group_max_patches=mask_group_max_patches, + mask_group_min_aspect_ratio=mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio=mask_group_max_aspect_ratio, + ) + masks = [mask_generator() for _ in images] + data["bool_masked_pos"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["FlavaImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..732d25e71f697e083e784199c1db1782298951ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py @@ -0,0 +1,491 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for Flava.""" + +import math +import random +from collections.abc import Iterable +from functools import lru_cache +from typing import Any, Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, + get_size_dict, +) +from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images +from ...image_utils import ImageInput, PILImageResampling, SizeDict, pil_torch_interpolation_mapping +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) +from .image_processing_flava import ( + FLAVA_CODEBOOK_MEAN, + FLAVA_CODEBOOK_STD, + FLAVA_IMAGE_MEAN, + FLAVA_IMAGE_STD, + LOGIT_LAPLACE_EPS, +) + + +class FlavaMaskingGenerator: + def __init__( + self, + input_size: Union[int, tuple[int, int]] = 14, + total_mask_patches: int = 75, + mask_group_max_patches: Optional[int] = None, + mask_group_min_patches: int = 16, + mask_group_min_aspect_ratio: Optional[float] = 0.3, + mask_group_max_aspect_ratio: Optional[float] = None, + ): + if not isinstance(input_size, tuple): + input_size = (input_size,) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.total_mask_patches = total_mask_patches + + self.mask_group_min_patches = mask_group_min_patches + self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches + + mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio + self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio)) + + def __repr__(self): + repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, + self.width, + self.mask_group_min_patches, + self.mask_group_max_patches, + self.total_mask_patches, + self.log_aspect_ratio[0], + self.log_aspect_ratio[1], + ) + return repr_str + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for _attempt in range(10): + target_area = random.uniform(self.mask_group_min_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + height = int(round(math.sqrt(target_area * aspect_ratio))) + width = int(round(math.sqrt(target_area / aspect_ratio))) + if width < self.width and height < self.height: + top = random.randint(0, self.height - height) + left = random.randint(0, self.width - width) + + num_masked = mask[top : top + height, left : left + width].sum() + # Overlap + if 0 < height * width - num_masked <= max_mask_patches: + zeros_pos = mask[top : top + height, left : left + width] == 0 + mask[top : top + height, left : left + width][zeros_pos] = 1 + delta += zeros_pos.sum() + + if delta > 0: + break + return delta + + def __call__(self): + mask = torch.zeros(self.get_shape(), dtype=torch.int) + mask_count = 0 + while mask_count < self.total_mask_patches: + max_mask_patches = self.total_mask_patches - mask_count + max_mask_patches = min(max_mask_patches, self.mask_group_max_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask + + +class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + Args: + return_image_mask (`bool`, *optional*, defaults to `False`): + Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. + input_size_patches (`int`, *optional*, defaults to 14): + Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden + by the `input_size_patches` parameter in `preprocess`. + total_mask_patches (`int`, *optional*, defaults to 75): + Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in + `preprocess`. + mask_group_min_patches (`int`, *optional*, defaults to 16): + Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` + parameter in `preprocess`. + mask_group_max_patches (`int`, *optional*): + Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` + parameter in `preprocess`. + mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): + Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter + in `preprocess`. + mask_group_max_aspect_ratio (`float`, *optional*): + Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter + in `preprocess`. + return_codebook_pixels (`bool`, *optional*, defaults to `False`): + Whether to return the codebook pixel values. + codebook_do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize` + parameter in `preprocess`. `codebook_size`. + codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in + `preprocess`. + codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`): + Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample` + parameter in `preprocess`. + codebook_do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to crop the input for codebook at the center. If the input size is smaller than + `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be + overridden by the `codebook_do_center_crop` parameter in `preprocess`. + codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Desired output size for codebook input when applying center-cropping. Can be overridden by the + `codebook_crop_size` parameter in `preprocess`. + codebook_do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be + overridden by the `codebook_do_rescale` parameter in `preprocess`. + codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Defines the scale factor to use if rescaling the codebook image. Can be overridden by the + `codebook_rescale_factor` parameter in `preprocess`. + codebook_do_map_pixels (`bool`, *optional*, defaults to `True`): + Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the + `codebook_do_map_pixels` parameter in `preprocess`. + codebook_do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can + be overridden by the `codebook_do_normalize` parameter in `preprocess`. + codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`): + The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden + by the `codebook_image_mean` parameter in `preprocess`. + codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can + be overridden by the `codebook_image_std` parameter in `preprocess`. + """ + + # Mask related params + return_image_mask: Optional[bool] + input_size_patches: Optional[int] + total_mask_patches: Optional[int] + mask_group_min_patches: Optional[int] + mask_group_max_patches: Optional[int] + mask_group_min_aspect_ratio: Optional[float] + mask_group_max_aspect_ratio: Optional[float] + # Codebook related params + return_codebook_pixels: Optional[bool] + codebook_do_resize: Optional[bool] + codebook_size: Optional[bool] + codebook_resample: Optional[int] + codebook_do_center_crop: Optional[bool] + codebook_crop_size: Optional[int] + codebook_do_rescale: Optional[bool] + codebook_rescale_factor: Optional[Union[int, float]] + codebook_do_map_pixels: Optional[bool] + codebook_do_normalize: Optional[bool] + codebook_image_mean: Optional[Union[float, Iterable[float]]] + codebook_image_std: Optional[Union[float, Iterable[float]]] + + +@auto_docstring +class FlavaImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = FLAVA_IMAGE_MEAN + image_std = FLAVA_IMAGE_STD + size = {"height": 224, "width": 224} + crop_size = {"height": 224, "width": 224} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + + # Mask related params + return_image_mask = False + input_size_patches = 14 + total_mask_patches = 75 + mask_group_min_patches = 16 + mask_group_max_patches = None + mask_group_min_aspect_ratio = 0.3 + mask_group_max_aspect_ratio = None + # Codebook related params + return_codebook_pixels = False + codebook_do_resize = True + codebook_size = {"height": 112, "width": 112} + # LANCZOS resample does not support torch Tensor. Use BICUBIC as closest alternative + codebook_resample = PILImageResampling.BICUBIC + codebook_do_center_crop = True + codebook_crop_size = {"height": 112, "width": 112} + codebook_do_rescale = True + codebook_rescale_factor = 1 / 255 + codebook_do_map_pixels = True + codebook_do_normalize = True + codebook_image_mean = FLAVA_CODEBOOK_MEAN + codebook_image_std = FLAVA_CODEBOOK_STD + valid_kwargs = FlavaFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + @classmethod + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)` + """ + image_processor_dict = image_processor_dict.copy() + if "codebook_size" in kwargs: + image_processor_dict["codebook_size"] = kwargs.pop("codebook_size") + if "codebook_crop_size" in kwargs: + image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size") + return super().from_dict(image_processor_dict, **kwargs) + + @lru_cache + def masking_generator( + self, + input_size_patches, + total_mask_patches, + mask_group_min_patches, + mask_group_max_patches, + mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio, + ) -> FlavaMaskingGenerator: + return FlavaMaskingGenerator( + input_size=input_size_patches, + total_mask_patches=total_mask_patches, + mask_group_min_patches=mask_group_min_patches, + mask_group_max_patches=mask_group_max_patches, + mask_group_min_aspect_ratio=mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio=mask_group_max_aspect_ratio, + ) + + def map_pixels(self, image: "torch.Tensor") -> "torch.Tensor": + return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS + + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + crop_size: Optional[SizeDict] = None, + default_to_square: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + codebook_size: Optional[SizeDict] = None, + codebook_crop_size: Optional[SizeDict] = None, + codebook_image_mean: Optional[Union[float, list[float]]] = None, + codebook_image_std: Optional[Union[float, list[float]]] = None, + codebook_resample: Optional[PILImageResampling] = None, + data_format: Optional[ChannelDimension] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if kwargs is None: + kwargs = {} + if size is not None: + size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) + if crop_size is not None: + crop_size = SizeDict(**get_size_dict(crop_size, param_name="crop_size")) + if isinstance(image_mean, list): + image_mean = tuple(image_mean) + if isinstance(image_std, list): + image_std = tuple(image_std) + if data_format is None: + data_format = ChannelDimension.FIRST + if codebook_size is not None: + codebook_size = SizeDict(**get_size_dict(size=codebook_size, default_to_square=default_to_square)) + if codebook_crop_size is not None: + codebook_crop_size = SizeDict(**get_size_dict(codebook_crop_size, param_name="codebook_crop_size")) + if isinstance(codebook_image_mean, list): + codebook_image_mean = tuple(codebook_image_mean) + if isinstance(codebook_image_std, list): + codebook_image_std = tuple(codebook_image_std) + + kwargs["size"] = size + kwargs["crop_size"] = crop_size + kwargs["image_mean"] = image_mean + kwargs["image_std"] = image_std + kwargs["codebook_size"] = codebook_size + kwargs["codebook_crop_size"] = codebook_crop_size + kwargs["codebook_image_mean"] = codebook_image_mean + kwargs["codebook_image_std"] = codebook_image_std + kwargs["data_format"] = data_format + kwargs["codebook_interpolation"] = ( + pil_torch_interpolation_mapping[codebook_resample] + if isinstance(codebook_resample, (PILImageResampling, int)) + else codebook_resample + ) + + # torch resize uses interpolation instead of resample + # Check if resample is an int before checking if it's an instance of PILImageResampling + # because if pillow < 9.1.0, resample is an int and PILImageResampling is a module. + # Checking PILImageResampling will fail with error `TypeError: isinstance() arg 2 must be a type or tuple of types`. + resample = kwargs.pop("resample") + kwargs["interpolation"] = ( + pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample + ) + + return kwargs + + def _preprocess_image( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_map_pixels: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + ) -> "torch.Tensor": + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + if do_map_pixels: + stacked_images = self.map_pixels(image=stacked_images) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return processed_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + # Mask related params + return_image_mask: Optional[bool], + input_size_patches: Optional[int], + total_mask_patches: Optional[int], + mask_group_min_patches: Optional[int], + mask_group_max_patches: Optional[int], + mask_group_min_aspect_ratio: Optional[float], + mask_group_max_aspect_ratio: Optional[float], + # Codebook related params + return_codebook_pixels: Optional[bool], + codebook_do_resize: Optional[bool], + codebook_size: Optional[SizeDict], + codebook_interpolation: Optional["F.InterpolationMode"], + codebook_do_center_crop: Optional[bool], + codebook_crop_size: Optional[SizeDict], + codebook_do_rescale: Optional[bool], + codebook_rescale_factor: Optional[float], + codebook_do_map_pixels: Optional[bool], + codebook_do_normalize: Optional[bool], + codebook_image_mean: Optional[Union[float, list[float]]], + codebook_image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + processed_images = self._preprocess_image( + images=images, + do_resize=do_resize, + size=size, + interpolation=interpolation, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + do_map_pixels=False, + image_mean=image_mean, + image_std=image_std, + disable_grouping=disable_grouping, + return_tensors=return_tensors, + ) + data = { + "pixel_values": processed_images, + } + + if return_codebook_pixels: + codebook_processed_images = self._preprocess_image( + images=images, + do_resize=codebook_do_resize, + size=codebook_size, + interpolation=codebook_interpolation, + do_center_crop=codebook_do_center_crop, + crop_size=codebook_crop_size, + do_rescale=codebook_do_rescale, + rescale_factor=codebook_rescale_factor, + do_normalize=codebook_do_normalize, + do_map_pixels=codebook_do_map_pixels, + image_mean=codebook_image_mean, + image_std=codebook_image_std, + disable_grouping=disable_grouping, + return_tensors=return_tensors, + ) + data["codebook_pixel_values"] = codebook_processed_images + + if return_image_mask: + mask_generator = self.masking_generator( + input_size_patches=input_size_patches, + total_mask_patches=total_mask_patches, + mask_group_min_patches=mask_group_min_patches, + mask_group_max_patches=mask_group_max_patches, + mask_group_min_aspect_ratio=mask_group_min_aspect_ratio, + mask_group_max_aspect_ratio=mask_group_max_aspect_ratio, + ) + masks = [mask_generator() for _ in range(len(images))] + masks = torch.stack(masks, dim=0) if return_tensors else masks + data["bool_masked_pos"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["FlavaImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..266c3e96af5a263db2b49a31223f9652563ae213 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py @@ -0,0 +1,2030 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FLAVA model.""" + +import collections +import math +from collections import OrderedDict +from dataclasses import dataclass +from typing import Any, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, filter_out_non_signature_kwargs, logging, torch_int +from .configuration_flava import ( + FlavaConfig, + FlavaImageCodebookConfig, + FlavaImageConfig, + FlavaMultimodalConfig, + FlavaTextConfig, +) + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook" + +LOGIT_SCALE_CLAMP_MIN = 0 +LOGIT_SCALE_CLAMP_MAX = 4.6052 + +FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig] + + +@dataclass +@auto_docstring( + custom_intro=""" + Output from FlavaModel containing embeddings and outputs from individual encoders. + + Note that `image_embeddings` and `text_embeddigns` returned are similar to pooled output returned from a + transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and + `text_projection` layers on `image_embeddings` and `text_embeddings` respectively. + """ +) +class FlavaModelOutput(ModelOutput): + r""" + image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present): + The image embeddings which are basically the pooled output of [`FlavaImageModel`]. + image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present): + The output of the [`FlavaImageModel`]. + text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present): + The text embeddings which are basically the pooled output of [`FlavaTextModel`]. + text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present): + The output of the [`FlavaTextModel`]. + multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`): + The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`]. + multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`): + The output of the [`FlavaMultimodalModel`]. + """ + + image_embeddings: Optional[torch.FloatTensor] = None + image_output: Optional[BaseModelOutputWithPooling] = None + text_embeddings: Optional[torch.FloatTensor] = None + text_output: Optional[BaseModelOutputWithPooling] = None + multimodal_embeddings: Optional[torch.FloatTensor] = None + multimodal_output: Optional[BaseModelOutputWithPooling] = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Class representing pretraining losses from FLAVA model + """ +) +class FlavaLosses(ModelOutput): + r""" + mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.): + Masked Image Modeling loss as used in BeIT calculated only for unimodal image data. + mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.): + Masked Language Modeling loss as used in BERT calculated only for unimodal text data. + itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.): + Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on + masked pairs in FLAVA. + global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.): + Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text + data. This is calculated on unmasked images and texts. + mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.): + Masked Multimodal Modeling loss's image component calculated on paired image-text data. + mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.): + Masked Multimodal Modeling loss's text component calculated on paired image-text data. + """ + + mim: Optional[torch.FloatTensor] = None + mlm: Optional[torch.FloatTensor] = None + itm: Optional[torch.FloatTensor] = None + global_contrastive: Optional[torch.FloatTensor] = None + mmm_image: Optional[torch.FloatTensor] = None + mmm_text: Optional[torch.FloatTensor] = None + + def all_none(self) -> bool: + all_none = True + for v in self.values(): + if v is not None: + all_none = False + break + return all_none + + +@dataclass +@auto_docstring( + custom_intro=""" + Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders. + + Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a + transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and + `text_projection` layers on `image_embeddings` and `text_embeddings` respectively. + """ +) +class FlavaForPreTrainingOutput(ModelOutput): + r""" + loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True): + Total loss calculated for this model. + loss_info (`FlavaLosses`): + Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on + the keys. + image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present): + The image embeddings which are basically the pooled output of [`FlavaImageModel`]. + image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present): + The output of the [`FlavaImageModel`]. + text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present): + The text embeddings which are basically the pooled output of [`FlavaTextModel`]. + text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present): + The output of the [`FlavaTextModel`]. + multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`): + The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`]. + multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`): + The output of the [`FlavaMultimodalModel`]. + image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present): + The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos` + to create masked images. + image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present): + The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images. + text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present): + The text embeddings which are basically the pooled output of [`FlavaTextModel`]. + text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present): + The output of the [`FlavaTextModel`]. + multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present): + The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`]. + multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present): + The output of the [`FlavaMultimodalModel`]. + mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not): + The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is + returned when `bool_masked_pos` has some of the patches masked. + mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not): + The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of + the tokens masked. + itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present): + The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA. + contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's + `image_projection` and `text_projection` layers respectively. This represents the image-text similarity + scores. This is calculated on unmasked images and texts. + contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's + `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and + texts. + mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present): + The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened + output is returned when `bool_masked_pos` has some of the patches masked. + mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present): + The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has + some of the tokens masked. + """ + + loss: Optional[torch.FloatTensor] = None + loss_info: FlavaLosses = None + image_embeddings: Optional[torch.FloatTensor] = None + image_output: Optional[BaseModelOutputWithPooling] = None + text_embeddings: Optional[torch.FloatTensor] = None + text_output: Optional[BaseModelOutputWithPooling] = None + multimodal_embeddings: Optional[torch.FloatTensor] = None + multimodal_output: Optional[BaseModelOutputWithPooling] = None + image_masked_embeddings: Optional[torch.FloatTensor] = None + image_masked_output: Optional[BaseModelOutputWithPooling] = None + text_masked_embeddings: Optional[torch.FloatTensor] = None + text_masked_output: Optional[BaseModelOutputWithPooling] = None + multimodal_masked_embeddings: Optional[torch.FloatTensor] = None + multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None + mim_logits: Optional[torch.FloatTensor] = None + mlm_logits: Optional[torch.FloatTensor] = None + itm_logits: Optional[torch.FloatTensor] = None + contrastive_logits_per_image: Optional[torch.FloatTensor] = None + contrastive_logits_per_text: Optional[torch.FloatTensor] = None + mmm_image_logits: Optional[torch.FloatTensor] = None + mmm_text_logits: Optional[torch.FloatTensor] = None + + def to_tuple(self) -> tuple[Any]: + transformer_outputs = [ + "text_output", + "image_output", + "multimodal_output", + "text_masked_output", + "image_masked_output", + "multimodal_masked_output", + ] + return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys()) + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py +class FlavaImageEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None: + super().__init__() + + use_mask_token = use_mask_token or config.mask_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = PatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + batch_size, seq_len, _ = embeddings.size() + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # B X H X W = B X HW + if bool_masked_pos.dim() == 3: + bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py +class PatchEmbeddings(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + image_size: int = 224, + patch_size: Union[int, tuple[int, int]] = 16, + num_channels: int = 3, + embed_dim: int = 768, + ): + super().__init__() + if not isinstance(image_size, collections.abc.Iterable): + image_size = (image_size, image_size) + if not isinstance(patch_size, collections.abc.Iterable): + patch_size = (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + x = self.projection(pixel_values).flatten(2).transpose(1, 2) + return x + + +class FlavaTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ): + input_shape = input_ids.size() + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class FlavaSelfAttention(nn.Module): + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class FlavaSelfOutput(nn.Module): + """ + The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other + models), due to the layernorm applied before each block. + """ + + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class FlavaAttention(nn.Module): + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + self.attention = FlavaSelfAttention(config) + self.output = FlavaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_outputs = self.attention( + hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions + ) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class FlavaIntermediate(nn.Module): + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class FlavaOutput(nn.Module): + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # Copied from transformers.models.vit.modeling_vit.ViTOutput.forward + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class FlavaLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: FlavaPossibleConfigs) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = FlavaAttention(config) + self.intermediate = FlavaIntermediate(config) + self.output = FlavaOutput(config) + + # TODO: Check fp32 layer norm possibility + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class FlavaEncoder(nn.Module): + def __init__(self, config: FlavaConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions + ) + + +class FlavaPooler(nn.Module): + def __init__(self, config: FlavaPossibleConfigs): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class FlavaPreTrainedModel(PreTrainedModel): + config: FlavaConfig + base_model_prefix = "flava" + supports_gradient_checkpointing = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, FlavaMaskedPredictionHead): + module.bias.data.zero_() + elif isinstance(module, FlavaImageEmbeddings): + module.cls_token.data.zero_() + module.position_embeddings.data.zero_() + if module.mask_token is not None: + module.mask_token.data.zero_() + elif isinstance(module, FlavaMultimodalModel): + if module.use_cls_token: + module.cls_token.data.zero_() + elif isinstance(module, FlavaModel): + module.logit_scale.data.fill_(self.config.logit_scale_init_value) + + +@auto_docstring +class FlavaImageModel(FlavaPreTrainedModel): + config: FlavaImageConfig + # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints. + base_model_prefix = "flava.image_model" + main_input_name = "pixel_values" + + def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + + self.config = config + + self.embeddings = FlavaImageEmbeddings(config) + self.encoder = FlavaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = FlavaPooler(config) if add_pooling_layer else None + + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embeddings + + def set_input_embeddings(self, value: nn.Module): + self.embeddings.patch_embeddings = value + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring +class FlavaTextModel(FlavaPreTrainedModel): + config: FlavaTextConfig + # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints. + base_model_prefix = "flava.text_model" + + def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = FlavaTextEmbeddings(config) + self.encoder = FlavaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = FlavaPooler(config) if add_pooling_layer else None + + self.post_init() + + def get_input_embeddings(self) -> PatchEmbeddings: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value: nn.Module): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.size() + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=input_ids.device) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, input_ids.device + ) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring +class FlavaMultimodalModel(FlavaPreTrainedModel): + config: FlavaMultimodalConfig + # This override allows us to load FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints. + base_model_prefix = "flava.multimodal_model" + main_input_name = "hidden_states" + + def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + self.use_cls_token = self.config.use_cls_token + if self.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + + self.encoder = FlavaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = FlavaPooler(config) if add_pooling_layer else None + + self.post_init() + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`): + The concatenated hidden states of unimodal encoders. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length, _ = hidden_states.size() + + if self.use_cls_token: + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + hidden_states = torch.cat((cls_tokens, hidden_states), dim=1) + seq_length += 1 + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, (batch_size, seq_length), hidden_states.device + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring +class FlavaModel(FlavaPreTrainedModel): + config: FlavaConfig + + def __init__(self, config: FlavaConfig): + super().__init__(config) + + if not isinstance(config.text_config, FlavaTextConfig): + raise TypeError( + "config.text_config is expected to be of type FlavaTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.image_config, FlavaImageConfig): + raise TypeError( + "config.image_config is expected to be of type FlavaImageConfig but is of type" + f" {type(config.image_config)}." + ) + + if not isinstance(config.multimodal_config, FlavaMultimodalConfig): + raise TypeError( + "config.multimodal_config is expected to be of type FlavaMultimodalConfig but " + + f"is of type {type(config.multimodal_config)}." + ) + + text_config = config.text_config + image_config = config.image_config + multimodal_config = config.multimodal_config + + self.projection_dim = config.projection_dim + self.text_hidden_size = text_config.hidden_size + self.image_hidden_size = image_config.hidden_size + self.mm_hidden_size = multimodal_config.hidden_size + + self.text_model = FlavaTextModel(text_config) + self.image_model = FlavaImageModel(image_config) + self.multimodal_model = FlavaMultimodalModel(multimodal_config) + + self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim) + self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size) + self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size) + # Initialize weights and apply final processing + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`FlavaTextModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, FlavaModel + + >>> model = FlavaModel.from_pretrained("{0}") + >>> processor = AutoProcessor.from_pretrained("{0}") + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt" + ... ) + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + ``` + """ + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + pooled_output = text_outputs.last_hidden_state + text_features = self.text_projection(pooled_output) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_image_features( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`FlavaImageModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, FlavaModel + >>> from transformers.image_utils import load_image + + >>> model = FlavaModel.from_pretrained("{0}") + >>> processor = AutoProcessor.from_pretrained("{0}") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.inference_mode(): + ... image_features = model.get_image_features(**inputs) + ``` + """ + image_outputs: BaseModelOutputWithPooling = self.image_model( + pixel_values=pixel_values, + bool_masked_pos=bool_masked_pos, + attention_mask=attention_mask, + head_mask=head_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + pooled_output = image_outputs.last_hidden_state + image_features = self.image_projection(pooled_output) + + return image_features + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_attention_mask: Optional[torch.Tensor] = None, + skip_multimodal_encoder: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: bool = True, + return_dict: Optional[bool] = None, + ) -> Union[tuple, FlavaOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*): + Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`: + - 1 for pixel values that are real (i.e., **not masked**), + - 0 for pixel values that are padding (i.e., **masked**). + skip_multimodal_encoder (*bool*, *optional*): + Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, FlavaModel + + >>> model = FlavaModel.from_pretrained("facebook/flava-full") + >>> processor = AutoProcessor.from_pretrained("facebook/flava-full") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True) + + >>> outputs = model(**inputs) + + >>> image_embeddings = outputs.image_embeddings + >>> text_embeddings = outputs.text_embeddings + >>> multimodal_embeddings = outputs.multimodal_embeddings + + >>> outputs.image_embeddings.shape + torch.Size([1, 197, 768]) + + >>> text_embeddings.shape + torch.Size([1, 7, 768]) + + >>> multimodal_embeddings.shape + torch.Size([1, 205, 768]) + ``` + """ + + return_dict = return_dict if return_dict is not None else self.config.return_dict + if not output_hidden_states: + raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`") + image_embeddings = None + image_states = None + image_mm_projection = None + image_output = None + if pixel_values is not None: + image_output = self.image_model( + pixel_values=pixel_values, + bool_masked_pos=bool_masked_pos, + attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeddings, image_states = image_output[0], image_output[2] + # Note that these states don't use final layernorm in the transformer model + image_mm_projection = self.image_to_mm_projection(image_states[-1]) + + text_embeddings = None + text_states = None + text_mm_projection = None + text_output = None + if input_ids is not None: + text_output = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_embeddings, text_states = text_output[0], text_output[2] + # Note that these states don't use final layernorm in the transformer model + text_mm_projection = self.text_to_mm_projection(text_states[-1]) + + multimodal_embeddings = None + multimodal_output = None + if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder: + if attention_mask is not None: + batch_size, seq_len, _ = image_mm_projection.shape + if self.multimodal_model.use_cls_token: + seq_len += 1 + attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device) + attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1) + else: + attention_multimodal = None + multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1) + multimodal_output = self.multimodal_model( + multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict + ) + multimodal_embeddings = multimodal_output[0] + + if not return_dict: + return ( + image_embeddings, + image_output, + text_embeddings, + text_output, + multimodal_embeddings, + multimodal_output, + ) + + return FlavaModelOutput( + image_embeddings=image_embeddings, + image_output=image_output, + text_embeddings=text_embeddings, + text_output=text_output, + multimodal_embeddings=multimodal_embeddings, + multimodal_output=multimodal_output, + ) + + +class FlavaImageCodebookResPath(nn.Module): + def __init__(self, in_size: int, out_size: int, **kwargs): + super().__init__() + hid_size = out_size // 4 + + path = OrderedDict() + path["relu_1"] = nn.ReLU() + path["conv_1"] = nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1) + path["relu_2"] = nn.ReLU() + path["conv_2"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1) + path["relu_3"] = nn.ReLU() + path["conv_3"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1) + path["relu_4"] = nn.ReLU() + path["conv_4"] = nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0) + + self.path = nn.Sequential(path) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.path(x) + + +class FlavaImageCodebookBlock(nn.Module): + def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs): + super().__init__() + + self.post_gain = 1 / (num_layers**2) + + if in_size != out_size: + self.id_path = nn.Conv2d(in_size, out_size, kernel_size=1, padding=0) + else: + self.id_path = nn.Identity() + + self.res_path = FlavaImageCodebookResPath(in_size, out_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class FlavaImageCodebookLayerGroup(nn.Module): + def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True): + super().__init__() + blocks = OrderedDict() + for i in range(num_blocks): + if i == 0: + blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers) + else: + blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers) + + if use_pool: + blocks["pool"] = nn.MaxPool2d(kernel_size=2) + + self.group = nn.Sequential(blocks) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.group(x) + + +# Inspired by DALLE Encoder in https://github.com/openai/DALL-E/blob/5be4b236bc3ade6943662354117a0e83752cc322/dall_e/encoder.py#L42 +@auto_docstring( + custom_intro=""" + The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used + to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use + `get_codebook_indices` to get image tokens for an image. + """ +) +class FlavaImageCodebook(FlavaPreTrainedModel): + base_model_prefix = "" + config: FlavaImageCodebookConfig + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + + def __init__( + self, + config: FlavaImageCodebookConfig, + **kwargs: Any, + ): + super().__init__(config) + + self.config = config + self.num_groups = config.num_groups + self.input_channels = config.input_channels + self.num_blocks_per_group = config.num_blocks_per_group + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + + num_layers = self.num_groups * self.num_blocks_per_group + + output_blocks = OrderedDict() + output_blocks["relu"] = nn.ReLU() + output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0) + + blocks = OrderedDict() + blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3) + blocks["group_1"] = FlavaImageCodebookLayerGroup( + self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size + ) + blocks["group_2"] = FlavaImageCodebookLayerGroup( + self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size + ) + blocks["group_3"] = FlavaImageCodebookLayerGroup( + self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size + ) + blocks["group_4"] = FlavaImageCodebookLayerGroup( + self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False + ) + blocks["output"] = nn.Sequential(output_blocks) + + self.blocks = nn.Sequential(blocks) + + self.post_init() + + if self.config.freeze: + for param in self.parameters(): + param.requires_grad = False + + def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor: + f""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing + `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoImageProcessor, FlavaImageCodebook + + >>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}") + >>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt") + >>> inputs = dict(pixel_values=inputs.codebook_pixel_values) + + >>> outputs = model.get_codebook_indices(**inputs) + ``` + """ + z_logits = self.blocks(pixel_values) + return torch.argmax(z_logits, axis=1) + + def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor: + z_logits = self.blocks(pixel_values) + return nn.Softmax(dim=1)(z_logits) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + f""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing + `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoImageProcessor, FlavaImageCodebook + + >>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}") + >>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt") + >>> inputs = dict(pixel_values=inputs.codebook_pixel_values) + + >>> outputs = model(**inputs) + >>> print(outputs.shape) + (1, 196) + ``` + """ + if len(pixel_values.shape) != 4: + raise ValueError(f"input shape {pixel_values.shape} is not 4d") + if pixel_values.shape[1] != self.input_channels: + raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}") + return self.blocks(pixel_values) + + +class FlavaPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class FlavaMaskedPredictionHead(nn.Module): + def __init__(self, config, weight=None): + super().__init__() + self.config = config + self.transform = FlavaPredictionHeadTransform(config) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + if weight is not None: + self.decoder.weight = weight + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, x): + x = self.transform(x) + x = self.decoder(x) + return x + + +class FlavaITMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pooler = FlavaPooler(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, x): + x = self.pooler(x) + x = self.seq_relationship(x) + return x + + +class FlavaGlobalContrastiveHead(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.global_backprop_contrastive = config.global_backprop_contrastive + + def forward(self, image_embeddings, text_embeddings, logit_scale): + temperature = torch.exp(logit_scale) + if not torch.distributed.is_available() or not torch.distributed.is_initialized(): + labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device) + image_embeddings_all = [image_embeddings] + text_embeddings_all = [text_embeddings] + else: + local_batch_size = image_embeddings.size(0) + world_size = torch.distributed.get_world_size() + + if self.global_backprop_contrastive: + # `torch.distributed.nn.functional.all_gather` does backprop on all active workers + # whereas `torch.distributed.all_gather` does only backpropagates on the current worker. + image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings) + text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings) + else: + image_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)] + text_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)] + torch.distributed.all_gather(image_embeddings_all, image_embeddings) + torch.distributed.all_gather(text_embeddings_all, text_embeddings) + + labels = local_batch_size * torch.distributed.get_rank() + torch.arange( + local_batch_size, device=image_embeddings.device + ) + + image_embeddings_all = torch.cat(image_embeddings_all) + text_embeddings_all = torch.cat(text_embeddings_all) + + logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature + logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature + + return logits_per_image, logits_per_text, labels + + +@auto_docstring( + custom_intro=""" + The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs. + """ +) +class FlavaForPreTraining(FlavaPreTrainedModel): + # Those are linked to xxx.bias + _tied_weights_keys = [ + "mmm_text_head.decoder.bias", + "mmm_image_head.decoder.bias", + "mlm_head.decoder.bias", + "mim_head.decoder.bias", + ] + + def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None): + r""" + image_codebook ([`nn.Module`]): + If passed, the image codebook will be set to this. Otherwise, it will be initialized using the + image_codebook_config defined in the config first as the first parameter. + """ + super().__init__(config) + self.flava = FlavaModel(config) + + self.image_codebook = image_codebook + if self.image_codebook is None and config.init_codebook: + self.image_codebook = FlavaImageCodebook(config.image_codebook_config) + + # Levarage text and image encoder configs to create the masked + # head since it has the right vocab + self.mim_head = FlavaMaskedPredictionHead(config.image_config) + self.mlm_head = FlavaMaskedPredictionHead(config.text_config) + self.itm_head = FlavaITMHead(config) + self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config) + self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config) + self.global_contrastive_head = FlavaGlobalContrastiveHead(config) + + self.image_vocab_size = config.image_config.vocab_size + self.text_vocab_size = config.text_config.vocab_size + self.mlm_weight = config.mlm_weight + self.mim_weight = config.mim_weight + self.global_contrastive_weight = config.global_contrastive_weight + self.ce_ignore_index = config.ce_ignore_index + self.itm_weight = config.itm_weight + self.mmm_image_weight = config.mmm_image_weight + self.mmm_text_weight = config.mmm_text_weight + self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder + + self.post_init() + + def _resize_to_2d(self, x: torch.Tensor): + if x.dim() > 2: + x = x.view(x.size(0), -1) + return x + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_ids_masked: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + codebook_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_attention_mask: Optional[torch.Tensor] = None, + skip_unmasked_multimodal_encoder: Optional[bool] = None, + mlm_labels: Optional[torch.Tensor] = None, + mim_labels: Optional[torch.Tensor] = None, + itm_labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: bool = True, + return_dict: Optional[bool] = None, + return_loss: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], FlavaForPreTrainingOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`): + Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task + to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with + [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*): + Pixel values for image patches that are used to compute the image codebook labels for masked image modeling. + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*): + Mask to avoid performing attention on padding token indices specifically for images. Mask values selected + in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + skip_unmasked_multimodal_encoder (*bool*, *optional*): + Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked + multimodal embeddings or outputs as of now. + mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*): + Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction). + Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with + indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, + ..., text_config.vocab_size - 1]`. + mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*): + Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ..., + image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are + generated automatically using the image codebook assigned to the model. By default, it uses + [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels. + itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*): + Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match. + The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well. + return_loss (`bool`, *optional*, default to None): + Whether to return calculated loss or not. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import FlavaForPreTraining, AutoProcessor + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full") + >>> processor = AutoProcessor.from_pretrained("facebook/flava-full") + + >>> text = ["a photo of a cat"] + + >>> inputs = processor( + ... images=[image], + ... text=text, + ... return_masks=True, + ... return_codebook_pixels=True, + ... padding=True, + ... max_length=77, + ... return_tensors="pt", + ... ) + + + >>> output = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_loss = return_loss if return_loss is not None else self.config.return_loss + + skip_unmasked_multimodal_encoder = ( + skip_unmasked_multimodal_encoder + if skip_unmasked_multimodal_encoder is not None + else self.skip_unmasked_multimodal_encoder + ) + + if input_ids_masked is None and input_ids is not None: + logger.warning( + "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to" + " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if" + " you are doing inference on unmasked text..." + ) + input_ids_masked = input_ids + + flava_output = self.flava( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + image_attention_mask=image_attention_mask, + # Don't need unmasked multimodal embedding for anything so skip it + # NOTE: ITM uses masked version + skip_multimodal_encoder=skip_unmasked_multimodal_encoder, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + # Pass true to have deterministic outputs + return_dict=True, + ) + + flava_masked_output = self.flava( + input_ids=input_ids_masked, + pixel_values=pixel_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + image_attention_mask=image_attention_mask, + bool_masked_pos=bool_masked_pos, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + pos_mask = None + + image_embeddings = flava_output.image_embeddings + text_embeddings = flava_output.text_embeddings + image_masked_embeddings = flava_masked_output.image_embeddings + text_masked_embeddings = flava_masked_output.text_embeddings + multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings + + total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None + mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None + itm_logits = logits_per_image = logits_per_text = None + + # Calculate mim_labels if necessary from the image_codebook + if image_masked_embeddings is not None or multimodal_masked_embeddings is not None: + if mim_labels is None and return_loss: + if self.image_codebook is None: + raise RuntimeError( + "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` " + " have been passed. Reinstantiate the model with `init_codebook` set to True or " + "pass in your custom `mim_labels`" + ) + if codebook_pixel_values is None: + raise ValueError( + "`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. " + "Call `AutoProcessor` with `return_codebook_pixels` set to True" + ) + mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values) + # Unimodal MIM Loss + # If multimodal embeddings are present, we will calculate MMM loss + if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None: + sequence_for_image = image_masked_embeddings + + if mim_labels is not None: + mim_labels = self._resize_to_2d(mim_labels) + bool_masked_pos = self._resize_to_2d(bool_masked_pos) + mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index + + sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :] + masked_tokens = mim_labels.ne(self.ce_ignore_index) + mim_labels_filtered = mim_labels[masked_tokens] + sequence_for_image = sequence_for_image[masked_tokens, :] + mim_logits = self.mim_head(sequence_for_image) + if return_loss: + mim_loss = nn.functional.cross_entropy( + mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1) + ) + mim_loss *= self.mim_weight + else: + mim_logits = self.mim_head(sequence_for_image) + + # Unimodal MLM Loss + if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None: + sequence_for_text = text_masked_embeddings + if mlm_labels is not None: + mlm_labels = self._resize_to_2d(mlm_labels) + sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :] + masked_tokens = mlm_labels.ne(self.ce_ignore_index) + mlm_labels_filtered = mlm_labels[masked_tokens] + sequence_for_text = sequence_for_text[masked_tokens, :] + mlm_logits = self.mlm_head(sequence_for_text) + if return_loss: + mlm_loss = nn.functional.cross_entropy( + mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1) + ) + mlm_loss *= self.mlm_weight + else: + mlm_logits = self.mlm_head(sequence_for_text) + + # ITM Loss + if self.itm_weight > 0 and multimodal_masked_embeddings is not None: + itm_logits = self.itm_head(multimodal_masked_embeddings) + + if itm_labels is not None: + pos_pairs = itm_labels.ne(0) + pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True])) + if return_loss: + itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels) + itm_loss *= self.itm_weight + + if multimodal_masked_embeddings is not None: + multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask] + + if mlm_labels is not None: + mlm_labels = mlm_labels[pos_mask] + + if mim_labels is not None: + mim_labels = mim_labels[pos_mask] + bool_masked_pos = bool_masked_pos[pos_mask] + + # MMM Image Loss + if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0: + sequence_for_image = multimodal_masked_embeddings + end_index = image_masked_embeddings.size(1) - 1 + sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :] + + if mim_labels is not None: + mim_labels = self._resize_to_2d(mim_labels) + bool_masked_pos = self._resize_to_2d(bool_masked_pos) + mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index + + masked_tokens = mim_labels.ne(self.ce_ignore_index) + mim_labels_filtered = mim_labels[masked_tokens] + sequence_for_image = sequence_for_image[masked_tokens, :] + mmm_image_logits = self.mmm_image_head(sequence_for_image) + if return_loss: + mmm_image_loss = nn.functional.cross_entropy( + mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1) + ) + mmm_image_loss *= self.mmm_image_weight + else: + mmm_image_logits = self.mmm_image_head(sequence_for_image) + + # MMM Text Loss + if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0: + sequence_for_text = multimodal_masked_embeddings + sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :] + + if mlm_labels is not None: + mlm_labels = self._resize_to_2d(mlm_labels) + masked_tokens = mlm_labels.ne(self.ce_ignore_index) + mlm_labels_filtered = mlm_labels[masked_tokens] + sequence_for_text = sequence_for_text[masked_tokens, :] + mmm_text_logits = self.mmm_text_head(sequence_for_text) + if return_loss: + mmm_text_loss = nn.functional.cross_entropy( + mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1) + ) + mmm_text_loss *= self.mmm_text_weight + else: + mmm_text_logits = self.mmm_text_head(sequence_for_text) + + # Global Contrastive Loss + if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0: + text_embedding = self.flava.text_projection(text_embeddings[:, 0, :]) + text_embedding = nn.functional.normalize(text_embedding, dim=-1) + + image_embedding = self.flava.image_projection(image_embeddings[:, 0, :]) + image_embedding = nn.functional.normalize(image_embedding, dim=-1) + + self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX) + + logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head( + image_embedding, text_embedding, self.flava.logit_scale + ) + + # Apply ITM negative mask if any + if pos_mask is not None: + logits_per_image = logits_per_image[pos_mask] + logits_per_text = logits_per_text[pos_mask] + gc_labels = gc_labels[pos_mask] + + if return_loss: + gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels) + gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels) + gc_loss = (gc_loss_image + gc_loss_text) / 2 + gc_loss *= self.global_contrastive_weight + + flava_losses = FlavaLosses( + mim=mim_loss, + mlm=mlm_loss, + itm=itm_loss, + global_contrastive=gc_loss, + mmm_image=mmm_image_loss, + mmm_text=mmm_text_loss, + ) + + if return_loss and not flava_losses.all_none(): + total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values()) + + if not return_dict: + output = ( + image_embeddings, + flava_output.image_output.to_tuple() if flava_output.image_output is not None else None, + text_embeddings, + flava_output.text_output.to_tuple() if flava_output.text_output is not None else None, + flava_output.multimodal_embeddings, + flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None, + image_masked_embeddings, + flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None, + text_masked_embeddings, + flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None, + multimodal_masked_embeddings, + flava_masked_output.multimodal_output.to_tuple() + if flava_masked_output.multimodal_output is not None + else None, + mim_logits, + mlm_logits, + itm_logits, + logits_per_image, + logits_per_image, + mmm_image_logits, + mmm_text_logits, + ) + if return_loss and not flava_losses.all_none(): + output = ( + total_loss, + flava_losses, + ) + output + + # Filter None as transformer by default won't handle it + return tuple(x for x in output if x is None) + + return FlavaForPreTrainingOutput( + loss=total_loss, + loss_info=flava_losses, + image_embeddings=image_embeddings, + image_output=flava_output.image_output, + text_embeddings=text_embeddings, + text_output=flava_output.text_output, + multimodal_embeddings=flava_output.multimodal_embeddings, + multimodal_output=flava_output.multimodal_output, + image_masked_embeddings=image_masked_embeddings, + image_masked_output=flava_masked_output.image_output, + text_masked_embeddings=text_masked_embeddings, + text_masked_output=flava_masked_output.text_output, + multimodal_masked_embeddings=multimodal_masked_embeddings, + multimodal_masked_output=flava_masked_output.multimodal_output, + mim_logits=mim_logits, + mlm_logits=mlm_logits, + itm_logits=itm_logits, + contrastive_logits_per_image=logits_per_image, + contrastive_logits_per_text=logits_per_text, + mmm_image_logits=mmm_image_logits, + mmm_text_logits=mmm_text_logits, + ) + + +__all__ = [ + "FlavaForPreTraining", + "FlavaImageCodebook", + "FlavaImageModel", + "FlavaModel", + "FlavaMultimodalModel", + "FlavaPreTrainedModel", + "FlavaTextModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..ceebdb6efa49134c3d078a8dd03b08da7c59fb9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for FLAVA +""" + +import warnings +from collections.abc import Iterable +from typing import Optional, Union + +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin + + +class FlavaImagesKwargs(ImagesKwargs): + # Mask related params + return_image_mask: Optional[bool] + input_size_patches: Optional[int] + total_mask_patches: Optional[int] + mask_group_min_patches: Optional[int] + mask_group_max_patches: Optional[int] + mask_group_min_aspect_ratio: Optional[float] + mask_group_max_aspect_ratio: Optional[float] + # Codebook related params + return_codebook_pixels: Optional[bool] + codebook_do_resize: Optional[bool] + codebook_size: Optional[bool] + codebook_resample: Optional[int] + codebook_do_center_crop: Optional[bool] + codebook_crop_size: Optional[int] + codebook_do_rescale: Optional[bool] + codebook_rescale_factor: Optional[Union[int, float]] + codebook_do_map_pixels: Optional[bool] + codebook_do_normalize: Optional[bool] + codebook_image_mean: Optional[Union[float, Iterable[float]]] + codebook_image_std: Optional[Union[float, Iterable[float]]] + + +class FlavaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: FlavaImagesKwargs + _defaults = {} + + +class FlavaProcessor(ProcessorMixin): + r""" + Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor. + + [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the + [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. + + Args: + image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. + tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "FlavaImageProcessor" + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + valid_processor_kwargs = FlavaProcessorKwargs + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["FlavaProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py new file mode 100644 index 0000000000000000000000000000000000000000..40da84e2e780821f26765333a2cee51030e0bea4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fuyu model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class FuyuConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an + Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the + [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 262144): + Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FuyuForCausalLM`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 16384): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 36): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 16384): + The maximum sequence length that this model might ever be used with. + image_size (`int`, *optional*, defaults to 300): + The input image size. + patch_size (`int`, *optional*, defaults to 30): + The input vision transformer encoding patch size. + num_channels (`int`, *optional*, defaults to 3): + The input image number of channels. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + rope_theta (`float`, *optional*, defaults to 25000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + qk_layernorm (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the Queries and Keys after projecting the hidden states + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after applying the MLP to the hidden states. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): + Percentage of the query and keys which will have rotary embedding. + + pad_token_id (`int`, *optional*): + The id of the *padding* token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the *beginning-of-sequence* token. + eos_token_id (`Union[int, list[int]]`, *optional*, defaults to 2): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + image_token_id (`int`, *optional*, defaults to 71011): + The id of the image placeholder token. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize the `language``[`Aut`]. + + ```python + >>> from transformers import FuyuConfig + + >>> # Initializing a Fuyu fuyu-7b style configuration + >>> configuration = FuyuConfig() + ```""" + + model_type = "fuyu" + sub_configs = {"text_config": AutoConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=262144, + hidden_size=4096, + intermediate_size=16384, + num_hidden_layers=36, + num_attention_heads=64, + hidden_act="relu2", + max_position_embeddings=16384, + image_size=300, + patch_size=30, + num_channels=3, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=25000.0, + rope_scaling=None, + qk_layernorm=True, + hidden_dropout=0.0, + attention_dropout=0.0, + partial_rotary_factor=0.5, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + image_token_id=71011, + text_config=None, + **kwargs, + ): + if text_config is None: + text_config = { + "vocab_size": vocab_size, + "max_position_embeddings": max_position_embeddings, + "hidden_size": hidden_size, + "intermediate_size": intermediate_size, + "num_hidden_layers": num_hidden_layers, + "num_attention_heads": num_attention_heads, + "hidden_act": hidden_act, + "initializer_range": initializer_range, + "layer_norm_eps": layer_norm_eps, + "use_cache": use_cache, + "rope_theta": rope_theta, + "rope_scaling": rope_scaling, + "qk_layernorm": qk_layernorm, + "hidden_dropout": hidden_dropout, + "attention_dropout": attention_dropout, + "partial_rotary_factor": partial_rotary_factor, + "pad_token_id": pad_token_id, + "bos_token_id": bos_token_id, + "eos_token_id": eos_token_id, + "tie_word_embeddings": tie_word_embeddings, + } + logger.info("text_config is None. initializing the text model with default values.") + text_model_type = text_config.get("model_type", "persimmon") + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self._vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.qk_layernorm = qk_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.partial_rotary_factor = partial_rotary_factor + self.image_token_id = image_token_id + self._rope_scaling_validation() + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["FuyuConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75fd33b864ad4f79952c2cc200e0f7463487ea73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff4dce97e8d9ca46814f5f7dfa539c9671a66939 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04b13775c660925697a7772373cc0ae6731ef28b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75af5d891127f73d4623327cd32d7af3da7fb20e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4989362133b183d823f74918ebec4e4bb37f08c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..413b2dcdaa8a1211d30d2a1551e40fa04d29aa87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4418a73de64afdc17a2e39be34729d24e937970f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e20d9814e9f328d81d239db08e26afa1613fabaa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b2133e5b7c31ede5c7c0e91e4697fd260d1312b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f272c26bec1f425cc7839e2abf5f210fd984c98 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..309fa053272dd3904836b34c752337c1a773e9fb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..689d15179be9eee38378f286ee287d58533ca18a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c4fe84b6a647aba705f5043964eda1ae9a8803c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bfc2b32688c827912dab312888c9d585e0504c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ffba882e79453821d8398fa810b8fae7fd164c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1983d78e86a651cef70e3ae00ddb3054ee972f23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8f5622e7d4ce992e732798d02188d46063bc8df Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd16fb4a5770058abd4d56849a34ad3bbee25b38 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f633d74e82300345a2eb81b289621a92951b237 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4da69bbbebfc94795056a3d7d7659141c1de374a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..00b6ccc53fc0efb0fc88c2f95586276cd40010fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_got_ocr2 import * + from .image_processing_got_ocr2 import * + from .image_processing_got_ocr2_fast import * + from .modeling_got_ocr2 import * + from .processing_got_ocr2 import * + + +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..eb22e62bfd7df1f56b8c5b13ac6fcacb90df6696 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -0,0 +1,840 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/got_ocr2/modular_got_ocr2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_got_ocr2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import collections +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers.utils.generic import check_model_inputs + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_got_ocr2 import GotOcr2Config, GotOcr2VisionConfig + + +class GotOcr2MLPBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim) + self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size) + self.act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.lin1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.lin2(hidden_states) + return hidden_states + + +class GotOcr2VisionAttention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__(self, config, window_size): + super().__init__() + input_size = ( + (config.image_size // config.patch_size, config.image_size // config.patch_size) + if window_size == 0 + else (window_size, window_size) + ) + + self.num_attention_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + self.scale = head_dim**-0.5 + self.dropout = config.attention_dropout + + self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias) + self.proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.use_rel_pos = config.use_rel_pos + if self.use_rel_pos: + if input_size is None: + raise ValueError("Input size must be provided if using relative positional encoding.") + + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + + Args: + q_size (int): + size of the query. + k_size (int): + size of key k. + rel_pos (`torch.Tensor`): + relative position embeddings (L, channel). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + def get_decomposed_rel_pos( + self, + query: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: tuple[int, int], + k_size: tuple[int, int], + ) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py + + Args: + query (`torch.Tensor`): + query q in the attention layer with shape (batch_size, query_height * query_width, channel). + rel_pos_h (`torch.Tensor`): + relative position embeddings (Lh, channel) for height axis. + rel_pos_w (`torch.Tensor`): + relative position embeddings (Lw, channel) for width axis. + q_size (tuple): + spatial sequence size of query q with (query_height, query_width). + k_size (tuple): + spatial sequence size of key k with (key_height, key_width). + + Returns: + decomposed_rel_pos (`torch.Tensor`): + decomposed relative position embeddings. + """ + query_height, query_width = q_size + key_height, key_width = k_size + relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h) + relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w) + + batch_size, _, dim = query.shape + reshaped_query = query.reshape(batch_size, query_height, query_width, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height) + rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width) + + decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] + + return decomposed_rel_pos + + def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, _ = hidden_states.shape + # qkv with shape (3, batch_size, nHead, height * width, channel) + qkv = ( + self.qkv(hidden_states) + .reshape(batch_size, height * width, 3, self.num_attention_heads, -1) + .permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (batch_size * nHead, height * width, channel) + query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + decomposed_rel_pos = self.get_decomposed_rel_pos( + query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights) + attn_weights = attn_weights + decomposed_rel_pos + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + + attn_output = self.proj(attn_output) + return attn_output, attn_weights + + +class GotOcr2VisionLayer(GradientCheckpointingLayer): + def __init__(self, config, window_size): + super().__init__() + self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn = GotOcr2VisionAttention(config, window_size) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = GotOcr2MLPBlock(config) + self.window_size = window_size + + def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]: + """ + Args: + Partition into non-overlapping windows with padding if needed. + hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window + size. + + Returns: + windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel]. + (pad_height, pad_width): padded height and width before partition + """ + batch_size, height, width, channel = hidden_states.shape + + pad_h = (window_size - height % window_size) % window_size + pad_w = (window_size - width % window_size) % window_size + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) + pad_height, pad_width = height + pad_h, width + pad_w + + hidden_states = hidden_states.reshape( + batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel + ) + windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel) + return windows, (pad_height, pad_width) + + def window_unpartition( + self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int] + ) -> torch.Tensor: + """ + Args: + Window unpartition into original sequences and removing padding. + hidden_states (tensor): + input tokens with [batch_size * num_windows, window_size, window_size, channel]. + window_size (int): + window size. + padding_shape (Tuple): + padded height and width (pad_height, pad_width). + original_shape (Tuple): original height and width (height, width) before padding. + + Returns: + hidden_states: unpartitioned sequences with [batch_size, height, width, channel]. + """ + pad_height, pad_width = padding_shape + height, width = original_shape + batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size) + hidden_states = windows.reshape( + batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1 + ) + hidden_states = ( + hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1) + ) + + hidden_states = hidden_states[:, :height, :width, :].contiguous() + return hidden_states + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: + residual = hidden_states + hidden_states = self.layer_norm1(hidden_states) + # Window partition + if self.window_size > 0: + height, width = hidden_states.shape[1], hidden_states.shape[2] + hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size) + + hidden_states, attn_weights = self.attn( + hidden_states=hidden_states, + ) + # Reverse window partition + if self.window_size > 0: + hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width)) + + hidden_states = residual + hidden_states + layernorm_output = self.layer_norm2(hidden_states) + hidden_states = hidden_states + self.mlp(layernorm_output) + return hidden_states + + +@auto_docstring +class GotOcr2PreTrainedModel(PreTrainedModel): + config: GotOcr2Config + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = False + _supports_sdpa = False + + _can_compile_fullgraph = True + _supports_flex_attn = False + _supports_attention_backend = True + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, GotOcr2VisionAttention): + if module.use_rel_pos: + module.rel_pos_h.data.zero_() + module.rel_pos_w.data.zero_() + elif isinstance(module, GotOcr2VisionEncoder): + if module.pos_embed is not None: + module.pos_embed.data.zero_() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for got_ocr2 vision model's outputs that also contains image embeddings obtained by applying the projection + layer to the pooler_output. + """ +) +class GotOcr2VisionEncoderOutput(ModelOutput): + r""" + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class GotOcr2PatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).permute(0, 2, 3, 1) + return embeddings + + +class GotOcr2LayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +class GotOcr2VisionNeck(nn.Module): + def __init__(self, config: GotOcr2VisionConfig): + super().__init__() + self.config = config + + self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first") + self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) + self.layer_norm2 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first") + + def forward(self, hidden_states): + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm1(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + return hidden_states + + +class GotOcr2VisionEncoder(GotOcr2PreTrainedModel): + _can_record_outputs = {"hidden_states": GotOcr2VisionLayer, "attentions": GotOcr2VisionAttention} + + def __init__(self, config: GotOcr2VisionConfig): + super().__init__(config) + self.config = config + self.image_size = config.image_size + self.patch_embed = GotOcr2PatchEmbeddings(config) + + self.pos_embed = None + if config.use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + self.pos_embed = nn.Parameter( + torch.zeros( + 1, + config.image_size // config.patch_size, + config.image_size // config.patch_size, + config.hidden_size, + ) + ) + + self.layers = nn.ModuleList() + for i in range(config.num_hidden_layers): + layer = GotOcr2VisionLayer( + config, + window_size=config.window_size if i not in config.global_attn_indexes else 0, + ) + self.layers.append(layer) + + self.neck = GotOcr2VisionNeck(config) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.patch_embed + + @check_model_inputs(tie_last_hidden_states=False) + def forward( + self, pixel_values: Optional[torch.FloatTensor] = None, **kwargs: Unpack[TransformersKwargs] + ) -> GotOcr2VisionEncoderOutput: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.patch_embed(pixel_values) + if self.pos_embed is not None: + hidden_states = hidden_states + self.pos_embed + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + hidden_states = self.neck(hidden_states) + return GotOcr2VisionEncoderOutput( + last_hidden_state=hidden_states, + ) + + +class GotOcr2MultiModalProjector(nn.Module): + def __init__(self, config: GotOcr2Config): + super().__init__() + vision_output_channels = config.vision_config.output_channels + language_hidden_size = config.text_config.hidden_size + self.conv_upsampler1 = nn.Conv2d( + vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv_upsampler2 = nn.Conv2d( + vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False + ) + self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size) + + def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor: + hidden_state = self.conv_upsampler1(vision_embeddings) + hidden_state = self.conv_upsampler2(hidden_state) + hidden_state = hidden_state.flatten(2).permute(0, 2, 1) + hidden_state = self.multimodal_projector(hidden_state) + return hidden_state + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for GotOcr2 causal language model (or autoregressive) outputs. + """ +) +class GotOcr2CausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for GotOcr2 outputs, with hidden states and attentions. + """ +) +class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The GotOcr2 model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class GotOcr2Model(GotOcr2PreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + + def __init__(self, config: GotOcr2Config): + super().__init__(config) + self.vision_tower = GotOcr2VisionEncoder(config.vision_config) + + self.multi_modal_projector = GotOcr2MultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values).last_hidden_state + return self.multi_modal_projector(image_outputs) + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, GotOcr2ModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + return GotOcr2ModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The GOT_OCR2 model which consists of a vision backbone and a language model. + """ +) +class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: GotOcr2Config): + super().__init__(config) + self.model = GotOcr2Model(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + **kwargs, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer + + >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda") + >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + + >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda") + + >>> # Generate + >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) + >>> generate_ids = model.generate( + ... **inputs, + ... do_sample=False, + ... tokenizer = processor.tokenizer, + ... stop_strings='<|im_end|>', + ... streamer=streamer, + ... max_new_tokens=4096, + ... ) + "You should keep in mind what features from the module should be used, especially + when you're planning to sell a template." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return GotOcr2CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["GotOcr2PreTrainedModel", "GotOcr2Model", "GotOcr2ForConditionalGeneration"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..0ecf39fcd03b577b406868a639d3ce8ee9425e3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py @@ -0,0 +1,483 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...cache_utils import Cache +from ...configuration_utils import PretrainedConfig +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, can_return_tuple, logging +from ..auto import CONFIG_MAPPING, AutoConfig +from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaModel, + LlavaModelOutputWithPast, + LlavaPreTrainedModel, + TransformersKwargs, +) +from ..sam.modeling_sam import ( + SamMLPBlock, + SamPreTrainedModel, + SamVisionAttention, + SamVisionEncoder, + SamVisionLayer, +) + + +logger = logging.get_logger(__name__) + + +class GotOcr2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GotOcr2VisionModel`]. It is used to instantiate a GOT_OCR2 + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + defaults will yield a similar configuration to that of the SAM ViT-h + [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + output_channels (`int`, *optional*, defaults to 256): + Dimensionality of the output channels in the Patch Encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input image. + image_size (`int`, *optional*, defaults to 1024): + Expected resolution. Target size of the resized input image. + patch_size (`int`, *optional*, defaults to 16): + Size of the patches to be extracted from the input image. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to query, key, value projections. + use_abs_pos (`bool`, *optional*, defaults to `True`): + Whether to use absolute position embedding. + use_rel_pos (`bool`, *optional*, defaults to `True`): + Whether to use relative position embedding. + window_size (`int`, *optional*, defaults to 14): + Window size for relative position. + global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + The indexes of the global attention layers. + mlp_dim (`int`, *optional*, defaults to 3072): + The dimensionality of the MLP layer in the Transformer encoder. + """ + + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + output_channels=256, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=1024, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-06, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + global_attn_indexes=[2, 5, 8, 11], + mlp_dim=3072, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.output_channels = output_channels + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.qkv_bias = qkv_bias + self.use_abs_pos = use_abs_pos + self.use_rel_pos = use_rel_pos + self.window_size = window_size + self.global_attn_indexes = global_attn_indexes + self.mlp_dim = mlp_dim + + +class GotOcr2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GotOcr2ForConditionalGeneration`]. It is used to instantiate a + GotOcr2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of GOT-OCR-2.0. + + e.g [stepfun-ai/GOT-OCR-2.0-hf](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + image_token_index (`int`, *optional*, defaults to 151859): + The image token index to encode the image prompt. + image_seq_length (`int`, *optional*, defaults to 576): + Sequence length of one image embedding. + pad_token_id (`int`, *optional*, defaults to -1): + Padding token id. + + ```python + >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config + + >>> # Initializing a GotOcr2 style configuration + >>> configuration = GotOcr2Config() + + >>> # Initializing a model from the Qwen2-VL-7B style configuration + >>> model = GotOcr2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "got_ocr2" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=151859, + image_seq_length=576, + pad_token_id=-1, + **kwargs, + ): + self.image_token_index = image_token_index + self.image_seq_length = image_seq_length + self.pad_token_id = pad_token_id + + if vision_config is None: + self.vision_config = GotOcr2VisionConfig() + elif isinstance(vision_config, dict): + self.vision_config = GotOcr2VisionConfig(**vision_config) + elif isinstance(vision_config, GotOcr2VisionConfig): + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "qwen2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]( + vocab_size=151860, + hidden_size=1024, + intermediate_size=2816, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=True, + rope_theta=1000000.0, + rope_scaling=None, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=21, + attention_dropout=0.0, + ) + + self.text_config = text_config + + super().__init__(**kwargs) + + +class GotOcr2MLPBlock(SamMLPBlock): + pass + + +class GotOcr2VisionAttention(SamVisionAttention): + pass + + +class GotOcr2VisionLayer(SamVisionLayer): + def __init__(self, config, window_size): + super().__init__(config, window_size) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn = GotOcr2VisionAttention(config, window_size) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = GotOcr2MLPBlock(config) + self.window_size = window_size + + +class GotOcr2PreTrainedModel(SamPreTrainedModel): + pass + + +class GotOcr2VisionEncoder(SamVisionEncoder, GotOcr2PreTrainedModel): + pass + + +class GotOcr2MultiModalProjector(nn.Module): + def __init__(self, config: GotOcr2Config): + super().__init__() + vision_output_channels = config.vision_config.output_channels + language_hidden_size = config.text_config.hidden_size + self.conv_upsampler1 = nn.Conv2d( + vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv_upsampler2 = nn.Conv2d( + vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False + ) + self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size) + + def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor: + hidden_state = self.conv_upsampler1(vision_embeddings) + hidden_state = self.conv_upsampler2(hidden_state) + hidden_state = hidden_state.flatten(2).permute(0, 2, 1) + hidden_state = self.multimodal_projector(hidden_state) + return hidden_state + + +class GotOcr2CausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +class GotOcr2ModelOutputWithPast(LlavaModelOutputWithPast): + pass + + +class GotOcr2PreTrainedModel(LlavaPreTrainedModel): + _supports_flash_attn = False + _supports_sdpa = False + _supports_flex_attn = False + + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + if isinstance(module, GotOcr2VisionAttention): + if module.use_rel_pos: + module.rel_pos_h.data.zero_() + module.rel_pos_w.data.zero_() + elif isinstance(module, GotOcr2VisionEncoder): + if module.pos_embed is not None: + module.pos_embed.data.zero_() + + +class GotOcr2Model(LlavaModel): + def __init__(self, config: GotOcr2Config): + super().__init__(config) + self.vision_tower = GotOcr2VisionEncoder(config.vision_config) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values).last_hidden_state + return self.multi_modal_projector(image_outputs) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, GotOcr2ModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype)) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + return GotOcr2ModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration): + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer + + >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda") + >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + + >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda") + + >>> # Generate + >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) + >>> generate_ids = model.generate( + ... **inputs, + ... do_sample=False, + ... tokenizer = processor.tokenizer, + ... stop_strings='<|im_end|>', + ... streamer=streamer, + ... max_new_tokens=4096, + ... ) + "You should keep in mind what features from the module should be used, especially + when you're planning to sell a template." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return GotOcr2CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + +__all__ = [ + "GotOcr2VisionConfig", + "GotOcr2Config", + "GotOcr2PreTrainedModel", + "GotOcr2Model", + "GotOcr2ForConditionalGeneration", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..16c062ec63ade1310971f8797291439b59bad5ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py @@ -0,0 +1,261 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Union + +import numpy as np + +from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...utils import is_vision_available, logging + + +if is_vision_available(): + from ...image_utils import load_images + +logger = logging.get_logger(__name__) + + +class GotOcr2TextKwargs(TextKwargs, total=False): + format: Optional[bool] + + +class GotOcr2ImagesKwargs(ImagesKwargs, total=False): + box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]] + color: Optional[str] + num_image_tokens: Optional[int] + multi_page: Optional[bool] + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: GotOcr2TextKwargs + images_kwargs: GotOcr2ImagesKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "format": False, + }, + "images_kwargs": { + "num_image_tokens": 256, + "multi_page": False, + "crop_to_patches": False, + "min_patches": 1, + "max_patches": 12, + }, + } + + +def preprocess_box_annotation(box: Union[list, tuple], image_size: tuple[int, int]) -> list: + """ + Convert box annotation to the format [x1, y1, x2, y2] in the range [0, 1000]. + """ + width, height = image_size + if len(box) == 4: + box[0] = int(box[0] / width * 1000) + box[1] = int(box[1] / height * 1000) + box[2] = int(box[2] / width * 1000) + box[3] = int(box[3] / height * 1000) + else: + raise ValueError("Box must be a list or tuple of lists in the form [x1, y1, x2, y2].") + + return list(box) + + +class GotOcr2Processor(ProcessorMixin): + r""" + Constructs a GotOcr2 processor which wraps a [`GotOcr2ImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~GotOcr2Processor.__call__`] and [`~GotOcr2Processor.decode`] for more information. + Args: + image_processor ([`GotOcr2ImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "PreTrainedTokenizerFast" + + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.message_start_token = "<|im_start|>" + self.message_end_token = "<|im_end|>" + self.img_start_token = "" + self.img_end_token = "" + self.img_pad_token = "" + self.image_token = "" # keep the above for BC, but we need to call it `image_token` + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." + + def _make_list_of_inputs(self, images, text, box, color, multi_page): + if not isinstance(images, (list, tuple)): + images = [images] + if multi_page: + logger.warning("Multi-page inference is enabled but only one image is passed.") + images = [images] + elif isinstance(images[0], (list, tuple)) and not multi_page: + raise ValueError("Nested images are only supported with `multi_page` set to `True`.") + elif not isinstance(images[0], (list, tuple)) and multi_page: + images = [images] + + if isinstance(text, str): + text = [text] + + if not isinstance(box[0], (list, tuple)): + # Use the same box for all images + box = [box for _ in range(len(images))] + if not isinstance(color, (list, tuple)): + color = [color for _ in range(len(images))] + + return images, text, box, color + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[GotOcr2ProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text` + is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and + `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + format (`bool`, *optional*): + If set, will add the format token to the query, and the model will return the OCR result with formatting. + box (`list[float]`, `list[tuple[float, float]]`, `list[tuple[float, float, float, float]]`, *optional*): + The box annotation to be added to the query. If a list of floats or a tuple of floats is provided, it + will be interpreted as [x1, y1, x2, y2]. If a list of tuples is provided, each tuple should be in the + form (x1, y1, x2, y2). + color (`str`, *optional*): + The color annotation to be added to the query. The model will return the OCR result within the box with + the specified color. + multi_page (`bool`, *optional*): + If set, will enable multi-page inference. The model will return the OCR result across multiple pages. + crop_to_patches (`bool`, *optional*): + If set, will crop the image to patches. The model will return the OCR result upon the patch reference. + min_patches (`int`, *optional*): + The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to + `True`. + max_patches (`int`, *optional*): + The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to + `True`. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + output_kwargs = self._merge_kwargs( + GotOcr2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + format_output = output_kwargs["text_kwargs"].pop("format") + num_image_tokens = output_kwargs["images_kwargs"].pop("num_image_tokens") + box = output_kwargs["images_kwargs"].pop("box", [None]) + color = output_kwargs["images_kwargs"].pop("color", None) + multi_page = output_kwargs["images_kwargs"].pop("multi_page") + + crop_to_patches = output_kwargs["images_kwargs"].get("crop_to_patches") + images, text, box, color = self._make_list_of_inputs(images, text, box, color, multi_page) + if multi_page: + # save the number of pages per batch + num_pages_per_batch = [len(image_group) for image_group in images] + # flatten the list of images + images = [image for image_group in images for image in image_group] + else: + num_pages_per_batch = [1 for _ in range(len(images))] + # Load images as we need to know the image size + images = load_images(images) + image_sizes = [image.size for image in images] + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + num_patches_array = image_inputs.pop("num_patches") + if text is None: + text = [] + patch_indices = np.cumsum(num_pages_per_batch) + for index, (num_pages, box_single, color_single) in enumerate(zip(num_pages_per_batch, box, color)): + current_patch_index = patch_indices[index - 1] if index > 0 else 0 + num_patches = sum(num_patches_array[current_patch_index : current_patch_index + num_pages]) + if box_single[0] is not None: + box_single = preprocess_box_annotation(box_single, image_sizes[index]) + query = ( + f"{f'[{color_single}] ' if color_single is not None else ''}" + f"{str(box_single) if box_single[0] is not None else ''} " + "OCR" + f"{' with format' if format_output else ''}" + f"{' across multi pages' if multi_page else ''}" + f"{' upon the patch reference' if crop_to_patches else ''}" + ": " + ) + prompt = ( + self.message_start_token + + self.system_query + + self.message_end_token + + self.message_start_token + + "user\n" + + self.img_start_token + + self.img_pad_token * num_image_tokens * num_patches + + self.img_end_token + + "\n" + + query + + self.message_end_token + + self.message_start_token + + "assistant\n" + ) + text.append(prompt) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + +__all__ = ["GotOcr2Processor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f01899e668e3a86548db3f59c7f42d70746385ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_gpt2 import * + from .modeling_flax_gpt2 import * + from .modeling_gpt2 import * + from .modeling_tf_gpt2 import * + from .tokenization_gpt2 import * + from .tokenization_gpt2_fast import * + from .tokenization_gpt2_tf import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..db5151a2ba15635a7943744799b0689fc96790d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py @@ -0,0 +1,274 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""OpenAI GPT-2 configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import Any, Optional + +from ... import PreTrainedTokenizer, TensorType, is_torch_available +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfigWithPast, PatchingSpec +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class GPT2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to + instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPT-2 + [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu_new"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (`string`, *optional*, defaults to `"cls_index"`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in for the multiple choice head in + [`GPT2DoubleHeadsModel`]. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + The dropout ratio to be used after the projection and activation. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + bos_token_id (`int`, *optional*, defaults to 50256): + Id of the beginning of sentence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 50256): + Id of the end of sentence token in the vocabulary. + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import GPT2Config, GPT2Model + + >>> # Initializing a GPT2 configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gpt2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class GPT2OnnxConfig(OnnxConfigWithPast): + def __init__( + self, + config: PretrainedConfig, + task: str = "default", + patching_specs: Optional[list[PatchingSpec]] = None, + use_past: bool = False, + ): + super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) + if not getattr(self._config, "pad_token_id", None): + # TODO: how to do that better? + self._config.pad_token_id = 0 + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"} + else: + common_inputs["attention_mask"] = {0: "batch", 1: "sequence"} + + return common_inputs + + @property + def num_layers(self) -> int: + return self._config.n_layer + + @property + def num_attention_heads(self) -> int: + return self._config.n_head + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + # We need to order the input in the way they appears in the forward() + ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) + + # Need to add the past_keys + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + past_shape = ( + batch, + self.num_attention_heads, + past_key_values_length, + self._config.hidden_size // self.num_attention_heads, + ) + ordered_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers) + ] + + ordered_inputs["attention_mask"] = common_inputs["attention_mask"] + if self.use_past: + mask_dtype = ordered_inputs["attention_mask"].dtype + ordered_inputs["attention_mask"] = torch.cat( + [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + + return ordered_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 + + +__all__ = ["GPT2Config", "GPT2OnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..8e419217c5a3642ee27f6f3df87e1c27c0d5ac79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py @@ -0,0 +1,782 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, +) +from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_gpt2 import GPT2Config + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "openai-community/gpt2" +_CONFIG_FOR_DOC = "GPT2Config" + + +GPT2_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`GPT2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +GPT2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class FlaxConv1D(nn.Module): + features: int + use_bias: bool = True + dtype: Any = jnp.float32 + precision: Any = None + + @nn.compact + def __call__(self, inputs): + inputs = jnp.asarray(inputs, self.dtype) + kernel = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, inputs.shape[-1])) + kernel = jnp.asarray(kernel.transpose(), self.dtype) + y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())), precision=self.precision) + if self.use_bias: + bias = self.param("bias", jax.nn.initializers.zeros, (self.features,)) + bias = jnp.asarray(bias, self.dtype) + y = y + bias + return y + + +class FlaxGPT2Attention(nn.Module): + config: GPT2Config + dtype: jnp.dtype = jnp.float32 + causal: bool = True + is_cross_attention: bool = False + + def setup(self): + config = self.config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + + if self.is_cross_attention: + self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype) + self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype) + else: + self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype) + self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype) + + self.resid_dropout = nn.Dropout(rate=config.resid_pdrop) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + if not is_cross_attention: + qkv_out = self.c_attn(hidden_states) + query, key, value = jnp.split(qkv_out, 3, axis=2) + else: + q_out = self.q_attn(hidden_states) + (query,) = jnp.split(q_out, 1, axis=2) + kv_out = self.c_attn(key_value_states) + key, value = jnp.split(kv_out, 2, axis=2) + + query = self._split_heads(query) + key = self._split_heads(key) + value = self._split_heads(value) + + query_length, key_length = query.shape[1], key.shape[1] + + if self.causal: + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + dropout_rng = None + if not deterministic and self.config.attn_pdrop > 0.0: + dropout_rng = self.make_rng("dropout") + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask) + + # transform boolean mask into float mask + if attention_mask is not None: + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + # usual dot product attention + attn_weights = dot_product_attention_weights( + query, + key, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attn_pdrop, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value) + attn_output = self._merge_heads(attn_output) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output, deterministic=deterministic) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxGPT2MLP(nn.Module): + config: GPT2Config + intermediate_size: int + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype) + self.c_proj = FlaxConv1D(embed_dim, dtype=self.dtype) + self.act = ACT2FN[self.config.activation_function] + self.dropout = nn.Dropout(rate=self.config.resid_pdrop) + + def __call__(self, hidden_states, deterministic: bool = True): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxGPT2Block(nn.Module): + config: GPT2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + hidden_size = self.config.hidden_size + inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype) + self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + if self.config.add_cross_attention: + self.crossattention = FlaxGPT2Attention( + config=self.config, dtype=self.dtype, causal=False, is_cross_attention=True + ) + self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + # residual connection + attn_output = attn_outputs[0] # output_attn: a, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + # Cross-Attention Block + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[1:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + outputs = (hidden_states,) + outputs + + return outputs + + +class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPT2Config + base_model_prefix = "transformer" + module_class: nn.Module = None + + def __init__( + self, + config: GPT2Config, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length)) + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + params: Optional[dict] = None, + past_key_values: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if encoder_hidden_states is not None and encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = input_ids.shape + + if position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") + + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPT2Attention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + encoder_hidden_states, + encoder_attention_mask, + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +class FlaxGPT2BlockCollection(nn.Module): + config: GPT2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.blocks = [ + FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = block( + hidden_states, + attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # this contains possible `None` values - `FlaxGPT2Module` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + return outputs + + +class FlaxGPT2Module(nn.Module): + config: GPT2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embed_dim = self.config.hidden_size + + self.wte = nn.Embed( + self.config.vocab_size, + self.embed_dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.wpe = nn.Embed( + self.config.max_position_embeddings, + self.embed_dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.embd_pdrop) + self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype) + self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + input_embeds = self.wte(input_ids.astype("i4")) + position_embeds = self.wpe(position_ids.astype("i4")) + + hidden_states = input_embeds + position_embeds + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + + outputs = self.h( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[2], + cross_attentions=outputs[3], + ) + + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, +) +class FlaxGPT2Model(FlaxGPT2PreTrainedModel): + module_class = FlaxGPT2Module + + +append_call_sample_docstring( + FlaxGPT2Model, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPastAndCrossAttentions, + _CONFIG_FOR_DOC, +) + + +class FlaxGPT2LMHeadModule(nn.Module): + config: GPT2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.transformer( + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T + lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + GPT2_START_DOCSTRING, +) +class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel): + module_class = FlaxGPT2LMHeadModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since GPT2 uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice( + extended_attention_mask, attention_mask.astype("i4"), (0, 0) + ) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxGPT2LMHeadModel, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = ["FlaxGPT2LMHeadModel", "FlaxGPT2Model", "FlaxGPT2PreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..ae0786179464115b880ab5d5b4c771292ad5b2db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py @@ -0,0 +1,1638 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT-2 model.""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, get_activation +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + auto_docstring, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from ...utils.model_parallel_utils import assert_device_map, get_device_map +from .configuration_gpt2 import GPT2Config + + +logger = logging.get_logger(__name__) + + +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """Load tf checkpoints in a pytorch model""" + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if module.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if module.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(module.layer_idx + 1) + + if not module.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = module.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + +class GPT2Attention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + self.config = config + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.is_causal = True + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with torch.autocast(query.device.type, enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + if is_cross_attention: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." + ) + query_states = self.q_attn(hidden_states) + attention_mask = encoder_attention_mask + + # Try to get key/value states from cache if possible + if past_key_values is not None and is_updated: + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + else: + query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + + shape_q = (*query_states.shape[:-1], -1, self.head_dim) + query_states = query_states.view(shape_q).transpose(1, 2) + + if (past_key_values is not None and not is_cross_attention) or ( + past_key_values is not None and is_cross_attention and not is_updated + ): + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_values.is_updated[self.layer_idx] = True + + is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention + + using_eager = self.config._attn_implementation == "eager" + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if using_eager and self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn( + query_states, key_states, value_states, attention_mask, head_mask + ) + else: + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + head_mask=head_mask, + dropout=self.attn_dropout.p if self.training else 0.0, + is_causal=is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous() + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output, attn_weights + + +class GPT2MLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GPT2Block(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPT2Attention(config=config, layer_idx=layer_idx) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = GPT2Attention(config=config, is_cross_attention=True, layer_idx=layer_idx) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = GPT2MLP(inner_dim, config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output, self_attn_weights = self.attn( + hidden_states, + past_key_values=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_output, cross_attn_weights = self.crossattention( + hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + # residual connection + hidden_states = residual + cross_attn_output + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + if encoder_hidden_states is not None: + outputs += (cross_attn_weights,) + + return outputs + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->GPT2 +class GPT2SequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`GPT2Config`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: GPT2Config): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. + """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring +class GPT2PreTrainedModel(PreTrainedModel): + config: GPT2Config + load_tf_weights = load_tf_weights_in_gpt2 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["GPT2Block"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + _supports_attention_backend = True + + _can_compile_fullgraph = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name == "c_proj.weight": + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of models predicting if two sentences are consecutive or not. + """ +) +class GPT2DoubleHeadsModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided): + Multiple choice classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[torch.FloatTensor] = None + mc_loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + mc_logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`dict[int, list]`, *optional*): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the + following number of attention modules: + + - openai-community/gpt2: 12 + - openai-community/gpt2-medium: 24 + - openai-community/gpt2-large: 36 + - openai-community/gpt2-xl: 48 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: + model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl") + device_map = { + 0: [0, 1, 2, 3, 4, 5, 6, 7, 8], + 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], + 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python + # On a 4 GPU machine with openai-community/gpt2-large: + model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large") + device_map = { + 0: [0, 1, 2, 3, 4, 5, 6, 7], + 1: [8, 9, 10, 11, 12, 13, 14, 15], + 2: [16, 17, 18, 19, 20, 21, 22, 23], + 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +@auto_docstring +class GPT2Model(GPT2PreTrainedModel): + _supports_param_buffer_assignment = False + + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + self._attn_implementation = config._attn_implementation + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + warnings.warn( + "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your" + " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1," + " ...}", + FutureWarning, + ) + self.device_map = ( + get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.h)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + self.wte = self.wte.to(self.first_device) + self.wpe = self.wpe.to(self.first_device) + # Load onto devices + for k, v in self.device_map.items(): + for block in v: + cuda_device = "cuda:" + str(k) + self.h[block] = self.h[block].to(cuda_device) + # ln_f to last + self.ln_f = self.ln_f.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + self.wte = self.wte.to("cpu") + self.wpe = self.wpe.to("cpu") + for index in range(len(self.h)): + self.h[index] = self.h[index].to("cpu") + self.ln_f = self.ln_f.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder + if use_cache: + if past_key_values is None: + past_key_values = DynamicCache(config=self.config) + elif isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. " + "You should pass an instance of `Cache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config)) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) + + # Attention mask. + # ._update_causal_mask() and ._prepare_4d_causal_attention_mask_with_cache_position() copied from LlamaModel + if attention_mask is not None and attention_mask.ndim < 4: + attention_mask = attention_mask.view(batch_size, -1) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + if _use_sdpa: + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + elif self._attn_implementation != "flash_attention_2": + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values if not (self.gradient_checkpointing and self.training) else None, + cache_position, + causal_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + + hidden_states = outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_values = past_key_values if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + warnings.warn( + "`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load" + " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':" + " 0, 'transformer.h.1': 1, ...}", + FutureWarning, + ) + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + cache_position=cache_position, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + # Flatten the tokens + loss = self.loss_function( + logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for + RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the + input embeddings, the classification head takes as input the input of a specified classification token index in the + input sequence). + """ +) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + config.num_labels = 1 + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = GPT2SequenceSummary(config) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + warnings.warn( + "`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should" + " load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your" + " own `device_map` but it needs to be a dictionary module_name to device, so for instance" + " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}", + FutureWarning, + ) + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.multiple_choice_head = self.multiple_choice_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + mc_token_ids: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + mc_labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, GPT2DoubleHeadsModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input): + Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - + 1]`. + labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to + `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]` + mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above) + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel + + >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2") + + >>> # Add a [CLS] to the vocabulary (we should train it also!) + >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"}) + >>> # Update the model embeddings with the new vocabulary size + >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) + + >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + >>> encoded_choices = [tokenizer.encode(s) for s in choices] + >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) + >>> lm_logits = outputs.logits + >>> mc_logits = outputs.mc_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + mc_loss = None + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) + lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_loss is not None: + output = (mc_loss,) + output + return ((lm_loss,) + output) if lm_loss is not None else output + + return GPT2DoubleHeadsModelOutput( + loss=lm_loss, + mc_loss=mc_loss, + logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The GPT2 Model transformer with a sequence classification head on top (linear layer). + + [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """ +) +class GPT2ForSequenceClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +class GPT2ForTokenClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = GPT2Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +class GPT2ForQuestionAnswering(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "GPT2DoubleHeadsModel", + "GPT2ForQuestionAnswering", + "GPT2ForSequenceClassification", + "GPT2ForTokenClassification", + "GPT2LMHeadModel", + "GPT2Model", + "GPT2PreTrainedModel", + "load_tf_weights_in_gpt2", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..42e23fc290151f09d47a30efca1cb7f4e4a3d669 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py @@ -0,0 +1,1238 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 OpenAI GPT-2 model.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFSequenceClassifierOutputWithPast, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFConv1D, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + TFSequenceSummary, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_gpt2 import GPT2Config + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "openai-community/gpt2" +_CONFIG_FOR_DOC = "GPT2Config" + + +class TFAttention(keras.layers.Layer): + def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs): + super().__init__(**kwargs) + + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implementation] + assert n_state % config.n_head == 0 + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + self.output_attentions = config.output_attentions + + self.is_cross_attention = is_cross_attention + + if self.is_cross_attention: + self.c_attn = TFConv1D(n_state * 2, nx, initializer_range=config.initializer_range, name="c_attn") + self.q_attn = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="q_attn") + else: + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") + + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") + self.attn_dropout = keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = keras.layers.Dropout(config.resid_pdrop) + self.pruned_heads = set() + self.embed_dim = n_state + + def prune_heads(self, heads): + pass + + @staticmethod + def causal_attention_mask(nd, ns, dtype): + """ + 1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), + -1, ns-nd), but doesn't produce garbage on TPUs. + """ + i = tf.range(nd)[:, None] + j = tf.range(ns) + m = i >= j - ns + nd + return tf.cast(m, dtype) + + def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False): + # q, k, v have shape [batch, heads, sequence, features] + w = tf.matmul(q, k, transpose_b=True) + if self.scale: + dk = tf.cast(shape_list(k)[-1], dtype=w.dtype) # scale attention_scores + w = w / tf.math.sqrt(dk) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + + # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. + _, _, nd, ns = shape_list(w) + b = self.causal_attention_mask(nd, ns, dtype=w.dtype) + b = tf.reshape(b, [1, 1, nd, ns]) + w = w * b - 1e4 * (1 - b) + + if attention_mask is not None: + # Apply the attention mask + attention_mask = tf.cast(attention_mask, dtype=w.dtype) + w = w + attention_mask + + w = stable_softmax(w, axis=-1) + w = self.attn_dropout(w, training=training) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + outputs = [tf.matmul(w, v)] + if output_attentions: + outputs.append(w) + return outputs + + def merge_heads(self, x): + x = tf.transpose(x, [0, 2, 1, 3]) + x_shape = shape_list(x) + new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] + return tf.reshape(x, new_x_shape) + + def split_heads(self, x): + x_shape = shape_list(x) + new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) + + def call( + self, + x, + layer_past, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + training=False, + ): + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." + ) + + query = self.q_attn(x) + kv_out = self.c_attn(encoder_hidden_states) + key, value = tf.split(kv_out, 2, axis=2) + attention_mask = encoder_attention_mask + else: + x = self.c_attn(x) + query, key, value = tf.split(x, 3, axis=2) + + query = self.split_heads(query) + key = self.split_heads(key) + value = self.split_heads(value) + if layer_past is not None: + past_key, past_value = tf.unstack(layer_past, axis=0, num=2) + key = tf.concat([past_key, key], axis=-2) + value = tf.concat([past_value, value], axis=-2) + + # to cope with keras serialization + if use_cache: + present = tf.stack([key, value], axis=0) + else: + present = (None,) + + attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training) + a = attn_outputs[0] + + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a, training=training) + + outputs = [a, present] + attn_outputs[1:] + return outputs # a, present, (attentions) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if self.is_cross_attention: + c_attn_shape = 2 * self.embed_dim + else: + c_attn_shape = 3 * self.embed_dim + if getattr(self, "c_proj", None) is not None: + with tf.name_scope(self.c_proj.name): + self.c_proj.build([None, None, self.embed_dim]) + if getattr(self, "c_attn", None) is not None: + with tf.name_scope(self.c_attn.name): + self.c_attn.build([None, None, c_attn_shape]) + if getattr(self, "q_attn", None) is not None: + with tf.name_scope(self.q_attn.name): + self.q_attn.build([None, None, self.embed_dim]) + + +class TFMLP(keras.layers.Layer): + def __init__(self, n_state, config, **kwargs): + super().__init__(**kwargs) + nx = config.n_embd + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") + self.act = get_tf_activation(config.activation_function) + self.dropout = keras.layers.Dropout(config.resid_pdrop) + self.intermediate_size = n_state + self.embed_dim = nx + + def call(self, x, training=False): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + h2 = self.dropout(h2, training=training) + return h2 + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "c_fc", None) is not None: + with tf.name_scope(self.c_fc.name): + self.c_fc.build([None, None, self.intermediate_size]) + if getattr(self, "c_proj", None) is not None: + with tf.name_scope(self.c_proj.name): + self.c_proj.build([None, None, self.embed_dim]) + + +class TFBlock(keras.layers.Layer): + def __init__(self, config, scale=False, **kwargs): + super().__init__(**kwargs) + nx = config.n_embd + inner_dim = config.n_inner if config.n_inner is not None else 4 * nx + self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.attn = TFAttention(nx, config, scale, name="attn") + self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") + + if config.add_cross_attention: + self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True) + self.ln_cross_attn = keras.layers.LayerNormalization( + epsilon=config.layer_norm_epsilon, name="ln_cross_attn" + ) + + self.mlp = TFMLP(inner_dim, config, name="mlp") + self.hidden_size = config.hidden_size + + def call( + self, + x, + layer_past, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + training=False, + ): + a = self.ln_1(x) + output_attn = self.attn( + a, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=use_cache, + output_attentions=output_attentions, + training=training, + ) + a = output_attn[0] # output_attn: a, present, (attentions) + outputs = output_attn[1:] + x = x + a + + # Cross-Attention Block + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + + ca = self.ln_cross_attn(x) + output_cross_attn = self.crossattention( + ca, + layer_past=None, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=False, + output_attentions=output_attentions, + training=training, + ) + ca = output_cross_attn[0] # output_attn: a, present, (cross_attentions) + x = x + ca + outputs = outputs + output_cross_attn[2:] # add cross attentions if we output attention weights + + m = self.ln_2(x) + m = self.mlp(m, training=training) + x = x + m + + outputs = [x] + outputs + return outputs # x, present, (attentions, cross_attentions) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "ln_1", None) is not None: + with tf.name_scope(self.ln_1.name): + self.ln_1.build([None, None, self.hidden_size]) + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "ln_2", None) is not None: + with tf.name_scope(self.ln_2.name): + self.ln_2.build([None, None, self.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + if getattr(self, "ln_cross_attn", None) is not None: + with tf.name_scope(self.ln_cross_attn.name): + self.ln_cross_attn.build([None, None, self.hidden_size]) + + +@keras_serializable +class TFGPT2MainLayer(keras.layers.Layer): + config_class = GPT2Config + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + self.config = config + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.use_cache = config.use_cache + self.return_dict = config.use_return_dict + + self.num_hidden_layers = config.n_layer + self.n_embd = config.n_embd + self.n_positions = config.n_positions + self.initializer_range = config.initializer_range + + self.wte = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="wte", + ) + self.wpe = keras.layers.Embedding( + input_dim=config.n_positions, + output_dim=config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="wpe", + ) + self.drop = keras.layers.Dropout(config.embd_pdrop) + self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] + self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") + self.embed_dim = config.hidden_size + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_length = 0 + past_key_values = [None] * len(self.h) + else: + past_length = shape_list(past_key_values[0][0])[-2] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0) + + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + attention_mask = tf.reshape(attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + one_cst = tf.constant(1.0) + attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype) + attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), tf.constant(-10000.0)) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.config.add_cross_attention and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=encoder_hidden_states.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + encoder_attention_mask = encoder_extended_attention_mask + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = self.wte(input_ids) + + position_embeds = self.wpe(position_ids) + + if token_type_ids is not None: + token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) + token_type_embeds = self.wte(token_type_ids) + else: + token_type_embeds = tf.constant(0.0) + + position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype) + token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype) + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states, training=training) + + output_shape = input_shape + [shape_list(hidden_states)[-1]] + + presents = () if use_cache else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) + + outputs = block( + hidden_states, + layer_past, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + training=training, + ) + + hidden_states, present = outputs[:2] + if use_cache: + presents = presents + (present,) + + if output_attentions: + all_attentions = all_attentions + (outputs[2],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (outputs[3],) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = tf.reshape(hidden_states, output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if output_attentions: + # let the number of heads free (-1) so we can extract attention even after head pruning + attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] + all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_attentions, all_cross_attentions] + if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wte", None) is not None: + with tf.name_scope(self.wte.name): + self.wte.build(None) + if getattr(self, "wpe", None) is not None: + with tf.name_scope(self.wpe.name): + self.wpe.build(None) + if getattr(self, "ln_f", None) is not None: + with tf.name_scope(self.ln_f.name): + self.ln_f.build([None, None, self.embed_dim]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFGPT2PreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPT2Config + base_model_prefix = "transformer" + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias", r"h.\d+.crossattention.bias"] + + @property + def input_signature(self): + # Although GPT-2 supports token_type_ids in theory, in practice they are rarely used, and the implementation + # means that passing token_type_ids=0 yields different outputs from token_type_ids=None. + # Therefore, we remove the token_type_ids argument by default, even though it would usually be included. + return { + "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + } + + +@dataclass +class TFGPT2DoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor | None = None + mc_logits: tf.Tensor | None = None + past_key_values: list[tf.Tensor] | None = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +GPT2_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`GPT2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see + `past_key_values` output below). Can be used to speed up sequential decoding. The token ids which have + their past given to this model should not be passed as input ids as they have already been computed. + attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for + `past_key_values`. In other words, the `attention_mask` always has to have the length: + `len(past_key_values) + len(input_ids)` + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, +) +class TFGPT2Model(TFGPT2PreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFGPT2MainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have + their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past`). Set to `False` during training, `True` during generation + """ + + outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + GPT2_START_DOCSTRING, +) +class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFGPT2MainLayer(config, name="transformer") + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids") + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + inputs = tf.expand_dims(inputs[:, -1], -1) + if token_type_ids is not None: + token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1) + + position_ids = kwargs.get("position_ids") + attention_mask = kwargs.get("attention_mask") + + if attention_mask is not None and position_ids is None: + position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True) + if past_key_values: + position_ids = tf.expand_dims(position_ids[:, -1], -1) + + return { + "input_ids": inputs, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "token_type_ids": token_type_ids, + } + + @unpack_inputs + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have + their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = tf.matmul(hidden_states, self.transformer.wte.weights, transpose_b=True) + + loss = None + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels, shifted_logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for + RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the + input embeddings, the classification head takes as input the input of a specified classification token index in the + input sequence). + """, + GPT2_START_DOCSTRING, +) +class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + config.num_labels = 1 + self.transformer = TFGPT2MainLayer(config, name="transformer") + self.multiple_choice_head = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="multiple_choice_head" + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + mc_token_ids: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFGPT2DoubleHeadsModelOutput | tuple[tf.Tensor]: + r""" + mc_token_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input): + Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - + 1]`. + + Return: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFGPT2DoubleHeadsModel + + >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + >>> model = TFGPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2") + + >>> # Add a [CLS] to the vocabulary (we should train it also!) + >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"}) + + >>> embedding_layer = model.resize_token_embeddings( + ... len(tokenizer) + ... ) # Update the model embeddings with the new vocabulary size + + >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + >>> encoded_choices = [tokenizer.encode(s) for s in choices] + >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 + >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 + + >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) + >>> lm_prediction_scores, mc_prediction_scores = outputs[:2] + ```""" + + if input_ids is not None: + input_shapes = shape_list(input_ids) + else: + input_shapes = shape_list(inputs_embeds)[:-1] + + seq_length = input_shapes[-1] + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + transformer_outputs = self.transformer( + input_ids=flat_input_ids, + past_key_values=past_key_values, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) + if return_dict and output_hidden_states: + # We do this to match the slightly odd PT behaviour - the final hidden state is reshaped to rank 4 when the + # input is rank 3, but all other hidden states remain at rank-3 (with the first 2 dims merged) + all_hidden_states = transformer_outputs.hidden_states[:-1] + (hidden_states,) + else: + all_hidden_states = None + lm_logits = tf.matmul(hidden_states, self.transformer.wte.weights, transpose_b=True) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training) + mc_logits = tf.squeeze(mc_logits, axis=-1) + + if not return_dict: + return (lm_logits, mc_logits) + transformer_outputs[1:] + + return TFGPT2DoubleHeadsModelOutput( + logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=all_hidden_states, + attentions=transformer_outputs.attentions, + ) + + @property + def input_signature(self): + return { + "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), + "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"), + } + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "multiple_choice_head", None) is not None: + with tf.name_scope(self.multiple_choice_head.name): + self.multiple_choice_head.build(None) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a sequence classification head on top (linear layer). + + [`TFGPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + GPT2_START_DOCSTRING, +) +class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.score = keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="score", + use_bias=False, + ) + self.transformer = TFGPT2MainLayer(config, name="transformer") + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint="microsoft/DialogRPT-updown", + output_type=TFSequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutputWithPast | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + logits_shape = shape_list(logits) + batch_size = logits_shape[0] + + if self.config.pad_token_id is None: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + else: + if input_ids is not None: + token_indices = tf.range(shape_list(input_ids)[-1]) + non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype) + last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1) + else: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + loss = None + + pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1) + + if labels is not None: + if self.config.pad_token_id is None and logits_shape[0] != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + + loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels])) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "score", None) is not None: + with tf.name_scope(self.score.name): + self.score.build([None, None, self.config.n_embd]) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +__all__ = [ + "TFGPT2DoubleHeadsModel", + "TFGPT2ForSequenceClassification", + "TFGPT2LMHeadModel", + "TFGPT2MainLayer", + "TFGPT2Model", + "TFGPT2PreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..608164ef2d83ab15bf7f99d33f9c6eb56ed1fcff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py @@ -0,0 +1,334 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + +import json +import os +from functools import lru_cache +from typing import Optional + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(PreTrainedTokenizer): + """ + Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import GPT2Tokenizer + + >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + >>> tokenizer("Hello world")["input_ids"] + [15496, 995] + + >>> tokenizer(" Hello world")["input_ids"] + [18435, 995] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*): + The token used for padding, for example when batching sequences of different lengths. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (GPT2 tokenizer detect beginning of words by the preceding space). + add_bos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading + word just as any other word. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + pad_token=None, + add_prefix_space=False, + add_bos_token=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + self.add_bos_token = add_bos_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is None: + return output + + return output + bos_token_ids + token_ids_1 + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if not self.add_bos_token: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + +__all__ = ["GPT2Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..f81c155e864476cf49c24f91a0235c939f42d3e0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + +from typing import Optional + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_gpt2 import GPT2Tokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + + +class GPT2TokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import GPT2TokenizerFast + + >>> tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2") + >>> tokenizer("Hello world")["input_ids"] + [15496, 995] + + >>> tokenizer(" Hello world")["input_ids"] + [18435, 995] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (GPT2 tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = GPT2Tokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + self.add_bos_token = kwargs.pop("add_bos_token", False) + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["GPT2TokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..145a45da0db6d36f75f5cec6091027e36541184e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py @@ -0,0 +1,119 @@ +import os +from typing import Optional, Union + +import tensorflow as tf +from tensorflow_text import pad_model_inputs + +from ...modeling_tf_utils import keras +from ...utils.import_utils import is_keras_nlp_available, requires +from .tokenization_gpt2 import GPT2Tokenizer + + +if is_keras_nlp_available(): + from keras_nlp.tokenizers import BytePairTokenizer + + +@requires(backends=("keras_nlp",)) +class TFGPT2Tokenizer(keras.layers.Layer): + """ + This is an in-graph tokenizer for GPT2. It should be initialized similarly to other tokenizers, using the + `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings + from an existing standard tokenizer object. + + In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run + when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options + than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes + straight from `tf.string` inputs to outputs. + + Args: + vocab (dict[str, int]): Vocabulary dict for Byte Pair Tokenizer + merges (list[str]): Merges list for Byte Pair Tokenizer + """ + + def __init__( + self, + vocab: dict[str, int], + merges: list[str], + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + ): + super().__init__() + self.pad_token_id = pad_token_id + self.max_length = max_length + self.vocab = vocab + self.merges = merges + + self.tf_tokenizer = BytePairTokenizer(vocab, merges, sequence_length=max_length) + + @classmethod + def from_tokenizer(cls, tokenizer: GPT2Tokenizer, *args, **kwargs): + """Creates TFGPT2Tokenizer from GPT2Tokenizer + + Args: + tokenizer (GPT2Tokenizer) + + Examples: + + ```python + from transformers import AutoTokenizer, TFGPT2Tokenizer + + tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer) + ``` + """ + merges = [" ".join(m) for m in tokenizer.bpe_ranks] + vocab = tokenizer.get_vocab() + return cls(vocab, merges, *args, **kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): + """Creates TFGPT2Tokenizer from pretrained GPT2Tokenizer + + Args: + pretrained_model_name_or_path (Union[str, os.PathLike]): Path to pretrained model + + Examples: + + ```python + from transformers import TFGPT2Tokenizer + + tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2") + ``` + """ + tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + return cls.from_tokenizer(tokenizer, *init_inputs, **kwargs) + + @classmethod + def from_config(cls, config): + """Creates TFGPT2Tokenizer from configurations + + Args: + config (Dict): Dictionary with keys such as stated in `get_config`. + """ + return cls(**config) + + def get_config(self): + return { + "vocab": self.vocab, + "merges": self.merges, + "max_length": self.max_length, + "pad_token_id": self.pad_token_id, + } + + def call(self, x, max_length: Optional[int] = None): + input_ids = self.tf_tokenizer(x) + attention_mask = tf.ones_like(input_ids) + + if self.pad_token_id is not None: + # pad the tokens up to max length + max_length = max_length if max_length is not None else self.max_length + + if max_length is not None: + input_ids, attention_mask = pad_model_inputs( + input_ids, max_seq_length=max_length, pad_value=self.pad_token_id + ) + + return {"attention_mask": attention_mask, "input_ids": input_ids} + + +__all__ = ["TFGPT2Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..94ba39d69ad638c706f6ac8491e2dea80e269929 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_gpt_neox_japanese import * + from .modeling_gpt_neox_japanese import * + from .tokenization_gpt_neox_japanese import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..320157334539b1d7c418c8cf97c8b57dc38629f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPTNeoX Japanese model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class GPTNeoXJapaneseConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPTNeoXModelJapanese`]. It is used to instantiate + a GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPTNeoXJapanese + [abeja/gpt-neox-japanese-2.7b](https://huggingface.co/abeja/gpt-neox-japanese-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Default configs is set as 2.7B model + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the GPTNeoXJapanese model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`GPTNeoXJapanese`]. + hidden_size (`int`, *optional*, defaults to 2560): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_multiple_size (`int`, *optional*, defaults to 4): + Dimension of the "intermediate" layer in the Transformer encoder is calculated by hidden_size * + intermediate_multiple_size. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + rotary_pct (`float`, *optional*, defaults to 1.00): + percentage of hidden dimensions to allocate to rotary embeddings + rotary_emb_base (`int`, *optional*, defaults to 10000) + base for computing rotary embeddings frequency + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the hidden layer. + Example: + + ```python + >>> from transformers import GPTNeoXJapaneseConfig, GPTNeoXJapaneseModel + + >>> # Initializing a GPTNeoXJapanese gpt-neox-japanese-2.7b style configuration + >>> configuration = GPTNeoXJapaneseConfig() + + >>> # Initializing a model (with random weights) from the gpt-neox-japanese-2.7b style configuration + >>> model = GPTNeoXJapaneseModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gpt_neox_japanese" + + def __init__( + self, + vocab_size=32000, + hidden_size=2560, + num_hidden_layers=32, + num_attention_heads=32, + intermediate_multiple_size=4, + hidden_act="gelu", + rotary_pct=1.00, + rotary_emb_base=10000, + max_position_embeddings=2048, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + bos_token_id=31996, + eos_token_id=31999, + rope_scaling=None, + attention_dropout=0.1, + hidden_dropout=0.0, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_multiple_size = intermediate_multiple_size + self.hidden_act = hidden_act + self.rotary_pct = rotary_pct + self.partial_rotary_factor = rotary_pct + self.rotary_emb_base = rotary_emb_base + self.rope_theta = rotary_emb_base + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_scaling = rope_scaling + self.attention_dropout = attention_dropout + self.hidden_dropout = hidden_dropout + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + +__all__ = ["GPTNeoXJapaneseConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..70399f376c7553fc5d7a1437b0a4b760732697ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -0,0 +1,755 @@ +# coding=utf-8 +# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch GPTNeoX model.""" + +import math +from typing import Optional, Union + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, is_torch_flex_attn_available, logging +from .configuration_gpt_neox_japanese import GPTNeoXJapaneseConfig + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class GPTNeoXJapanesePreTrainedModel(PreTrainedModel): + config: GPTNeoXJapaneseConfig + base_model_prefix = "gpt_neox_japanese" + _no_split_modules = ["GPTNeoXJapaneseLayer"] + _skip_keys_device_placement = "past_key_values" + + _can_compile_fullgraph = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, GPTNeoXJapaneseAttention): + if module.dense_bias is not None: + module.dense_bias.data.zero_() + + +class GPTNeoXJapaneseAttention(nn.Module): + def __init__(self, config, use_bias=False, layer_idx=None): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_attention_heads + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.layer_idx = layer_idx + self.rotary_ndims = int(self.head_size * config.rotary_pct) + self.rope_theta = config.rotary_emb_base + self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config) + self.attention_dropout = nn.Dropout(config.attention_dropout) + self.norm_factor = math.sqrt(self.head_size) + + self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False) + self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + # Activate bias if the last layer + self.use_bias = use_bias + self.dense_bias = nn.Parameter(torch.zeros(config.hidden_size)) if use_bias else None + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Cache] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + ): + # Compute QKV + # Attention heads [batch, seq_len, hidden_size] + # --> [batch, seq_len, (np * 3 * head_size)] + qkv = self.query_key_value(hidden_states) + + # [batch, seq_len, (num_heads * 3 * head_size)] + # --> [batch, seq_len, num_heads, 3 * head_size] + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims :] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims :] + + cos, sin = position_embeddings + query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin) + query = torch.cat((query, query_pass), dim=-1).contiguous() + key = torch.cat((key, key_pass), dim=-1).contiguous() + + # Cache QKV values + if layer_past is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "partial_rotation_size": self.rotary_ndims, + "cache_position": cache_position, + } + key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs) + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Reshape outputs + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) + attn_output = self.dense(attn_output) + + return attn_output, attn_weights, self.dense_bias + + @classmethod + def _split_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + # tensor: [bs, seq_len, hidden_size] + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view(new_shape) + # -> [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3) + return tensor + + @classmethod + def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + # tensor [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3).contiguous() + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size) + # -> [bs, seq_len, hidden_size] + return tensor + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + + # [batch_size * num_heads, q_length, kv_length] + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attention_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=1.0 / self.norm_factor, + ) + + attention_scores = attention_scores.view(batch_size, num_attention_heads, query_length, -1) + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attention_scores = attention_scores + causal_mask + + attn_weights = nn.functional.softmax(attention_scores, dim=-1) + attn_weights = self.attention_dropout(attn_weights) + attn_weights = attn_weights.to(value.dtype) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese +class GPTNeoXJapaneseRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: GPTNeoXJapaneseConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor: + """add bias to x, apply dropout and residual connection + + Args: + x (Tensor): main path of output + bias (Tensor): None or attn_bias of the last attention layer + residual (Optional[Tensor]): residual value + prob (float): dropout probability + training (bool): whether in training mode or not + + Returns: + Tensor: dropout(x + bias) + residual + """ + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + if residual is not None: + out = residual + out + return out + + +class GPTNeoXJapaneseMLP(nn.Module): + def __init__(self, config): + super().__init__() + intermediate_size = int(config.hidden_size * config.intermediate_multiple_size) + self.dense_h_to_4h = nn.Linear(config.hidden_size, intermediate_size, bias=False) + # Project back to h. + self.dense_4h_to_h = nn.Linear(intermediate_size, config.hidden_size, bias=False) + self.act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + intermediate = self.dense_h_to_4h(hidden_states) + intermediate = self.act(intermediate) + output = self.dense_4h_to_h(intermediate) + return output + + +class GPTNeoXJapaneseLayer(nn.Module): + def __init__(self, config, layer_number): + super().__init__() + self.layer_number = layer_number + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # activate bias only last layer + self.attention = GPTNeoXJapaneseAttention( + config=config, use_bias=layer_number == config.num_hidden_layers - 1, layer_idx=layer_number + ) + self.mlp = GPTNeoXJapaneseMLP(config) + self.hidden_dropout = config.hidden_dropout + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + ): + residual = hidden_states + ln_out = self.input_layernorm(hidden_states) + attn_output, attn_weights, attn_bias = self.attention( + ln_out, + attention_mask=attention_mask, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + position_ids=position_ids, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + # attn_output = (atten_output + bias) + residual + attn_output = bias_dropout_add( + attn_output, + bias=attn_bias.expand_as(residual) if attn_bias is not None else attn_bias, + residual=residual, + prob=self.hidden_dropout, + training=self.training, + ) + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + + # attn_output = (mlp_output + mlp_bias) + atten_output + attn_output = bias_dropout_add( + mlp_output, bias=None, residual=attn_output, prob=self.hidden_dropout, training=self.training + ) + + return attn_output, attn_weights + + +@auto_docstring +class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [GPTNeoXJapaneseLayer(config=config, layer_number=i) for i in range(config.num_hidden_layers)] + ) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_in + + def set_input_embeddings(self, value): + self.embed_in = value + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, BaseModelOutputWithPast]: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, GPTNeoXJapaneseModel + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b") + >>> model = GPTNeoXJapaneseModel.from_pretrained("abeja/gpt-neox-japanese-2.7b") + + >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + seq_length = inputs_embeds.shape[1] + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + head_mask=head_mask[i], + layer_past=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + hidden_states = outputs[0] + if output_attentions: + all_attentions = all_attentions + (outputs[1],) + + hidden_states = self.final_layer_norm(hidden_states) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@auto_docstring( + custom_intro=""" + GPTNeoXJapanese Model with a `language modeling` head on top for Classifier Model fine-tuning. + """ +) +class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel, GenerationMixin): + _tied_weights_keys = ["embed_out.weight"] + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.gpt_neox_japanese = GPTNeoXJapaneseModel(config) + self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.embed_out + + def set_output_embeddings(self, new_embeddings): + self.embed_out = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b") + >>> config = GPTNeoXJapaneseConfig.from_pretrained("abeja/gpt-neox-japanese-2.7b") + >>> config.is_decoder = True + >>> model = GPTNeoXJapaneseForCausalLM.from_pretrained("abeja/gpt-neox-japanese-2.7b", config=config) + + >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox_japanese( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + lm_logits = self.embed_out(hidden_states) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + + lm_loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithPast( + loss=lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "GPTNeoXJapaneseForCausalLM", + "GPTNeoXJapaneseLayer", + "GPTNeoXJapaneseModel", + "GPTNeoXJapanesePreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..584e74a8123e7cfdf31c4738a656a8417085e9a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -0,0 +1,369 @@ +# coding=utf-8 +# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for GPTNeoXJapanese.""" + +import collections +import json +import os +import re +import sys +from typing import Optional + +import numpy as np + +from ...tokenization_utils_fast import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"} + + +def load_vocab_and_emoji(vocab_file, emoji_file): + """Loads a vocabulary file and emoji file into a dictionary.""" + with open(emoji_file, "r", encoding="utf-8") as f: + emoji = json.loads(f.read()) + + vocab = collections.OrderedDict() + raw_vocab = collections.OrderedDict() + ids_to_tokens = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as f: + token = f.readlines() + token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token] + for idx, b in enumerate(token): + ids_to_tokens[idx] = b + raw_vocab[",".join(b)] = idx + for wd in b: + vocab[wd] = idx + + return vocab, raw_vocab, ids_to_tokens, emoji + + +class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer): + """ + This tokenizer inherits from [`PreTrainedTokenizer`] and is based on Japanese special Sub-Word-Encoding that is + used in this repository (https://github.com/tanreinama/Japanese-BPEEncoder_V2). Check the repository for details. + Japanese has a relatively large vocabulary and there is no separation between words. Furthermore, the language is a + combination of hiragana, katakana, and kanji, and variants such as "1" and "①" are often used. In order to cope + with these, this tokenizer has the following features + - Subword-by-subword segmentation, which is intermediate between byte strings and morphological analysis. + - BPEs are created for each Kanji, Hiragana, and Katakana character, and there are no BPEs that cross character + types, such as Kanji + Hiragana or Hiragana + Katakana. + - All-byte encoding that does not require . + - Independent of UTF codes such as 2-byte and 3-byte characters + - Conversion of heterographs to the same token_id + - Emoji and Emoticon are grouped into 12 types as special tags. + + Example: + + ```python + >>> from transformers import GPTNeoXJapaneseTokenizer + + >>> tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b") + >>> # You can confirm both 慶応 and 慶應 are encoded to 17749 + >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"] + [30014, 26883, 26638, 27228, 25, 26650, 31732, 31679, 27809, 26638, 17749, 31592, 17749, 31593, 321, 1281] + + >>> # Both 慶応 and 慶應 are decoded to 慶応 + >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]) + '吾輩は猫である🐯。実は慶応(慶応)大学出身' + ``` + + Args: + vocab_file (`str`): + File containing the vocabulary. + emoji_file (`str`): + File containing the emoji. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding + bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + do_clean_text (`bool`, *optional*, defaults to `False`): + Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + emoji_file, + unk_token="<|endoftext|>", + pad_token="<|endoftext|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + do_clean_text=False, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + if not os.path.isfile(emoji_file): + raise ValueError( + f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google" + " pretrained model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.do_clean_text = do_clean_text + self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file) + self.subword_tokenizer = SubWordJapaneseTokenizer( + vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji + ) + super().__init__( + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + do_clean_text=do_clean_text, + **kwargs, + ) + + @property + def vocab_size(self): + # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab + return len(self.raw_vocab) + + def get_vocab(self): + return dict(self.raw_vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.subword_tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = "".join(tokens).strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + emoji_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"] + ) + else: + vocab_file = ( + (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"] + ) + emoji_file = ( + (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"] + ) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token_index, token in self.ids_to_tokens.items(): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(",".join(token) + "\n") + index += 1 + with open(emoji_file, "w", encoding="utf-8") as writer: + json.dump(self.emoji, writer) + return vocab_file, emoji_file + + +class SubWordJapaneseTokenizer: + """ + https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the + original repository. + + MIT License + + Copyright (c) 2020 tanreinama + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + documentation files (the "Software"), to deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or substantial portions of + the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO + THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + + def __init__(self, vocab, ids_to_tokens, emoji): + self.vocab = vocab # same as swe + self.ids_to_tokens = ids_to_tokens # same as bpe + self.emoji = emoji + self.maxlen = np.max([len(w) for w in self.vocab]) + self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)") + self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*") + self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}") + self.content_repatter4 = re.compile( + r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" + ) + self.content_repatter5 = re.compile( + r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" + ) + # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using + # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly + # different regex that should generally have the same behaviour in most non-pathological cases. + if sys.version_info >= (3, 11): + self.content_repatter6 = re.compile( + r"(?:\d,\d{3}|[\d億])*+" + r"(?:\d,\d{3}|[\d万])*+" + r"(?:\d,\d{3}|[\d千])*+" + r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+" + r"(?:\(税込\)|\(税抜\)|\+tax)*" + ) + else: + self.content_repatter6 = re.compile( + r"(?:\d,\d{3}|[\d億万千])*" + r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+" + r"(?:\(税込\)|\(税抜\)|\+tax)*" + ) + keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿" + blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟" + self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "")) + + def __len__(self): + return len(self.ids_to_tokens) + + def clean_text(self, content): + content = self.content_repatter1.sub("", content) + content = self.content_repatter2.sub("", content) + content = self.content_repatter3.sub("", content) + content = self.content_repatter4.sub("", content) + content = self.content_repatter5.sub("", content) + content = self.content_repatter6.sub("", content) + content = content.translate(self.content_trans1) + while "" in content: + content = content.replace("", "") + return content + + def tokenize(self, text, clean=False): + text = text.replace(" ", "") + text = text.replace(" ", "") + text = text.replace("\r\n", "
") + text = text.replace("\n", "
") + text = text.replace("\r", "
") + text = text.replace("\t", "") + text = text.replace("—", "ー") + text = text.replace("−", "ー") + for k, v in self.emoji["emoji"].items(): + if k in text: + text = text.replace(k, v) + if clean: + text = self.clean_text(text) + + def check_simbol(x): + e = x.encode() + if len(x) == 1 and len(e) == 2: + c = (int(e[0]) << 8) + int(e[1]) + if ( + (c >= 0xC2A1 and c <= 0xC2BF) + or (c >= 0xC780 and c <= 0xC783) + or (c >= 0xCAB9 and c <= 0xCBBF) + or (c >= 0xCC80 and c <= 0xCDA2) + ): + return True + return False + + def checku2e(x): + e = x.encode() + if len(x) == 1 and len(e) == 3: + c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2]) + if c >= 0xE28080 and c <= 0xE2B07F: + return True + return False + + pos = 0 + result = [] + while pos < len(text): + end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3 + candidates = [] # (token_id, token, pos) + for e in range(end, pos, -1): + wd = text[pos:e] + if wd in self.vocab: + if wd[0] == "<" and len(wd) > 2: + candidates = [(self.vocab[wd], wd, e)] + break + else: + candidates.append((self.vocab[wd], wd, e)) + if len(candidates) > 0: + # the smallest token_id is adopted + _, wd, e = min(candidates, key=lambda x: x[0]) + result.append(wd) + pos = e + else: + end = pos + 1 + wd = text[pos:end] + if check_simbol(wd): + result.append("") + elif checku2e(wd): + result.append("") + else: + for i in wd.encode("utf-8"): + result.append("<|byte%d|>" % i) + pos = end + return result + + def convert_id_to_token(self, index, breakline="\n"): + words = [] + byte_tokens = [] + word = self.ids_to_tokens[index][0] + if word[:6] == "<|byte" and word[-2:] == "|>": + byte_tokens.append(int(word[6:-2])) + else: + if len(byte_tokens) > 0: + words.append(bytearray(byte_tokens).decode("utf-8", errors="replace")) + byte_tokens = [] + if word[:7] == "<|emoji" and word[-2:] == "|>": + words.append(self.emoji["emoji_inv"][word]) + elif word == "": + words.append(" ") + elif word == "
": + words.append(breakline) + elif word == "": + words.append("\t") + elif word == "": + words.append("▀") + elif word == "": + words.append("ǀ") + elif word == "": + words.append("‖") + else: + words.append(word) + if len(byte_tokens) > 0: + words.append(bytearray(byte_tokens).decode("utf-8", errors="replace")) + text = "".join(words) + return text + + +__all__ = ["GPTNeoXJapaneseTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e54fff573bc3284897d78701cceed524b666f26 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70df94b2f279f4aa8b1929346f0499bef5415919 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a38ad7b036115c6a8dfc8299d31067335308b5f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58c328384d4f11d9d0f5b102fbfe7e721a34862f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f186b307eda97787c0fa0ae3c996777695a8707 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..416f1aaa3d5ee2038677eae16f36bf2bd5c93de9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66c7da841533d71cc7210ec81e523b6ce15fe13a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20524758bca5ac06c1ad7fc1b0512df1f9d494cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1859f05413a8160947a731ca9b0e41bd012c9726 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc3bb4c73c835739ddb032f15362540324528b86 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de5d111f021ba40a229a0980658f520022a8bc5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e54160a374472bac9f9d45755321d20c80feaec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ebcf89b12667ea2ab25a8d3db2c0616b3355c70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61ad2d5d8e1045b78b0a5bfddf88db6dfee29413 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b3db93b295a3c4ee749b8b7d11a023228311a67 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24a80c07bee62cc6258e87afc915ac14e404c44f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c9f03835c46ffc2934e9259148e90363a399db3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aacef232e012c0deb5d8c3c9b9c6abf457e87860 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..557ac68e7afe69c6a124f6bd0754184ea2b6afc5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35f806a9831c3bf9f24e567241e3bf699e7dec01 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2614fa62b5341ba29e46539288a293cc615e789a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa264f3e6130b1b348f4ec1d3fd0034f2ee235eb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fac78d81571e4b13017e49ce6699306a732c1804 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65ca021d2da2a20610a6bba549c15c82d7dafa3f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57da8394089340d4336b7674a76088e713d6d439 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ded1c7bc0aeae8833cf1b446f00d8296d490b286 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c506153a44cf06a3098563da5005d79f02b8bd52 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a14765a39e630128effef8bc0f3f9cc1e9292751 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db6b919b86c92a734e0b7025b791f45060524990 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..339d263f96e816b7b81d93d36f184567e66226d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bab239e3ee6a52a5007dae8409ceebda112c465 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b8caa3956def119f07377bbf05da305a07bd532 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68c2a7670812d6052d3876034346c1d78d62af00 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cab0aa9516080207d9c57a2c97845509df57675 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8923af1de116219405577646ae2dcedee5602ccc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ijepa import * + from .modeling_ijepa import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..5f528adad0d55711f40ea21f58e1e0196822f449 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""I-JEPA model configuration""" + +from ...configuration_utils import PretrainedConfig + + +class IJepaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the I-JEPA + [facebook/ijepa_vith14_1k](https://huggingface.co/facebook/ijepa_vith14_1k) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + pooler_output_size (`int`, *optional*): + Dimensionality of the pooler layer. If None, defaults to `hidden_size`. + pooler_act (`str`, *optional*, defaults to `"tanh"`): + The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and + Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are + supported for Tensorflow. + + Example: + + ```python + >>> from transformers import IJepaConfig, IJepaModel + + >>> # Initializing a IJEPA ijepa-base-patch16-224 style configuration + >>> configuration = IJepaConfig() + + >>> # Initializing a model (with random weights) from the ijepa-base-patch16-224 style configuration + >>> model = IJepaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ijepa" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + pooler_output_size=None, + pooler_act="tanh", + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size + self.pooler_act = pooler_act + + +__all__ = ["IJepaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..b26a78e49ab0e251cd5bdee256a01b8d9da61451 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py @@ -0,0 +1,540 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/ijepa/modular_ijepa.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_ijepa.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import TransformersKwargs, auto_docstring, torch_int +from ...utils.generic import can_return_tuple, check_model_inputs +from .configuration_ijepa import IJepaConfig + + +class IJepaPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config: IJepaConfig): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class IJepaEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__() + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = IJepaPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class IJepaSelfAttention(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size + + key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2) + query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class IJepaSelfOutput(nn.Module): + """ + The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class IJepaAttention(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.attention = IJepaSelfAttention(config) + self.output = IJepaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + self_attn_output, _ = self.attention(hidden_states, head_mask) + output = self.output(self_attn_output, hidden_states) + return output + + +class IJepaIntermediate(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class IJepaOutput(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +class IJepaLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: IJepaConfig): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = IJepaAttention(config) + self.intermediate = IJepaIntermediate(config) + self.output = IJepaOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states_norm = self.layernorm_before(hidden_states) + attention_output = self.attention(hidden_states_norm, head_mask) + + # first residual connection + hidden_states = attention_output + hidden_states + + # in IJepa, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + return layer_output + + +@auto_docstring +class IJepaPreTrainedModel(PreTrainedModel): + config: IJepaConfig + base_model_prefix = "ijepa" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": IJepaLayer, + "attentions": IJepaSelfAttention, + } + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + if module.mask_token is not None: + module.mask_token.data.zero_() + + +class IJepaEncoder(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.config = config + self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput: + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module(hidden_states, layer_head_mask) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +class IJepaPooler(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.pooler_output_size) + self.activation = ACT2FN[config.pooler_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class IJepaModel(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. + """ + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = IJepaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = IJepaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> IJepaPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask) + + sequence_output = encoder_outputs.last_hidden_state + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output) + + +@auto_docstring( + custom_intro=""" + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """ +) +class IJepaForImageClassification(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.ijepa = IJepaModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.ijepa( + pixel_values, + head_mask=head_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + sequence_output = outputs.last_hidden_state + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8e6e152f3c115db4a1712895239467ea45e7df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py @@ -0,0 +1,186 @@ +from typing import Optional, Union + +import torch +import torch.nn as nn + +from transformers.models.ijepa.configuration_ijepa import IJepaConfig + +from ...modeling_outputs import BaseModelOutputWithPooling, ImageClassifierOutput +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, torch_int +from ..vit.modeling_vit import ViTEmbeddings, ViTForImageClassification, ViTModel, ViTPreTrainedModel + + +class IJepaEmbeddings(ViTEmbeddings): + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__(config, use_mask_token) + # Remove cls_token from IJepaEmbeddings, as it is not used in the model + del self.cls_token + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +@auto_docstring +class IJepaPreTrainedModel(ViTPreTrainedModel): + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + if module.mask_token is not None: + module.mask_token.data.zero_() + + +class IJepaModel(IJepaPreTrainedModel, ViTModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. + """ + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + + +@auto_docstring( + custom_intro=""" + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """ +) +class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification): + def __init__(self, config: IJepaConfig): + super().__init__(config) + self.ijepa = IJepaModel(config, add_pooling_layer=False) + self.post_init() + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.ijepa( + pixel_values, + head_mask=head_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + sequence_output = outputs.last_hidden_state + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "IJepaPreTrainedModel", + "IJepaModel", + "IJepaForImageClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f66a948f8f2ead454d7b12cd679da19215bf4fdf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37101c3567ec512a2477abdcb6d99fa34b358c91 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b11fb0bb4244097bbaaf32e65f1eaa5b56f7f2a8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e9257b8d9916e0f4b87d887889921fa83273ffd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bda3110565ed727ca07758fd7eeec9ba1205bd7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd4b9d3c05b62b13db68052ec3607a447fb3adfd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d1f8ba340619b00abe949f0caf11fbbdba8cd3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc213ecbf133daa522a555788e80f91f00e86d94 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e62821ffb638998431b500af746549ff95539b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bb27772d551980591bc074eb618f415854011b0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c775c1c75b0b8ce59280d752b080107259fe5d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..030a813a6a416ce09d307fad444eb21792b42bd6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cffd5a21c593601a2dd49bfd60c803c25647a63 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78a40bca366192133e77ddc92c385f71578bc518 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d624bfc614a3077a25ddbe48d9a25e940b722a09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fcab2da3935780829be685743783540fd7de7c1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b04b353e69f4f323a250e658bbe04236d863d68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1b46e3797f8f868aa8c6e89d03efd8f2873198e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b5d4c46d23934b6301073e0f383da6bb86baaa4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c39c5a8e9ee6c96e4a056f8f82d0e6ce912250a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e35699a59342b713e0b22c99ff970db80a603f9e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c18666149670cb32212696021032e2df06a84ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf3ae90ba8372247ffd6d7e19cc10e73e9c13866 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff636bd8f995fcc21f2cc29848c7a6ce9b54aa44 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8ec5f416037f1751a468d3a17d2a892c622c4b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d409d6e492c72461e0c0da6769bb6a0f3df7cba1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b791531e3743248dd77066357d290ea7b4dbcdd3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6084c8a664470b0b9085a08ed202075a4f476b14 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc4ba56803aca18a4f294f880b63c17927070fd8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a4f57c798fd33591ae0845e01a11c262213d06 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3efa785517cf84b9177c4c31184a42e10e9d1395 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31c7a8c60ab80dd82aaa50bc77a0dcf93de0539e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..378b5df98aa00b04a2c3adbec944274a5ee11f3c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..926aa533733f115f5df85ccc87f00593a1299022 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..288c4d7dad3897a48567c216ab6b0eaa6a42e6ee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a853afb0f4b3eb8a113d8922776285c72d10ea15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..773c2c62c2e3667fb65e09e01a4ad9a56fd4cb46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e5afdb17d5b319df8636cfb2a1886c3ab478bbb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c8d7e77c6baef0f702f807bb807026d398c3898 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d92fd8c800335898a162543e5cc763349038c94 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8429dc7e80c1ced93d7fa79b1b36d472eec26e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_llava_next import * + from .image_processing_llava_next import * + from .image_processing_llava_next_fast import * + from .modeling_llava_next import * + from .processing_llava_next import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..17ea71b1aa6421c8e2007cbb9399a69e9894288e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Llava-NeXT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class LlavaNextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an + Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) + model. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + image_token_index (`int`, *optional*, defaults to 32000): + The image token index to encode the image prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): + A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list + of the form `(height, width)`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + image_seq_length (`int`, *optional*, defaults to 576): + Sequence length of one image embedding. + multimodal_projector_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in the multimodal projector. + + Example: + + ```python + >>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig + + >>> # Initializing a CLIP-vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration + >>> configuration = LlavaNextConfig(vision_config, text_config) + + >>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration + >>> model = LlavaNextForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_next" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=32000, + projector_hidden_act="gelu", + vision_feature_select_strategy="default", + vision_feature_layer=-2, + image_grid_pinpoints=None, + tie_word_embeddings=False, + image_seq_length=576, + multimodal_projector_bias=True, + **kwargs, + ): + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.image_seq_length = image_seq_length + self.multimodal_projector_bias = multimodal_projector_bias + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." + f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + image_grid_pinpoints = ( + image_grid_pinpoints + if image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) + self.image_grid_pinpoints = image_grid_pinpoints + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["clip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["llama"]() + + self.text_config = text_config + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = ["LlavaNextConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..350ce9db7dc6f53e11090bf8f222665f8fcaf9a9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py @@ -0,0 +1,724 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for LLaVa-NeXT.""" + +from collections.abc import Iterable +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import ( + BaseImageProcessor, + BatchFeature, + get_patch_output_size, + get_size_dict, + select_best_resolution, +) +from ...image_transforms import ( + PaddingMode, + convert_to_rgb, + get_resize_output_image_size, + pad, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + from PIL import Image + + +def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: + """ + Divides an image into patches of a specified size. + + Args: + image (`np.ndarray`): + The input image. + patch_size (`int`): + The size of each patch. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list: A list of np.ndarray representing the patches. + """ + patches = [] + height, width = get_image_size(image, channel_dim=input_data_format) + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + if input_data_format == ChannelDimension.LAST: + patch = image[i : i + patch_size, j : j + patch_size] + else: + patch = image[:, i : i + patch_size, j : j + patch_size] + patches.append(patch) + + return patches + + +def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.ndarray: + """ + Expands an image to a square by adding a background color. + """ + + height, width = get_image_size(image, channel_dim=input_data_format) + if width == height: + return image + elif width > height: + result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color + result[(width - height) // 2 : (width - height) // 2 + height, :] = image + return result + else: + result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color + result[:, (height - width) // 2 : (height - width) // 2 + width] = image + return result + + +class LlavaNextImageProcessor(BaseImageProcessor): + r""" + Constructs a LLaVa-NeXT image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques + for processing high resolution images as explained in the [LLaVa paper](https://huggingface.co/papers/2310.03744). + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values", "image_sizes"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + image_grid_pinpoints: Optional[list] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = True, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + image_grid_pinpoints = ( + image_grid_pinpoints + if image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.image_grid_pinpoints = image_grid_pinpoints + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_pad = do_pad + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize with CLIP->LLaVa + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def pad( + self, + image: np.ndarray, + padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + if mode == PaddingMode.CONSTANT: + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def _preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Image.Image: + """ + Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + images = make_flat_list_of_images(images) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + return images + + def _resize_for_patching( + self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.ndarray: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.ndarray): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.ndarray: The resized and padded image. + """ + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple): + original_height, original_width = original_resolution + target_height, target_width = target_resolution + paste_x, r_x = divmod(target_width - original_width, 2) + paste_y, r_y = divmod(target_height - original_height, 2) + return (paste_y, paste_y + r_y), (paste_x, paste_x + r_x) + + def _pad_for_patching( + self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.ndarray: + """ + Pad an image to a target resolution while maintaining aspect ratio. + """ + new_resolution = get_patch_output_size(image, target_resolution, input_data_format) + padding = self._get_padding_size(new_resolution, target_resolution) + + padded_image = self.pad(image, padding=padding) + + return padded_image + + def get_image_patches( + self, + image: np.ndarray, + grid_pinpoints, + size: tuple, + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> list[np.ndarray]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (np.ndarray): + The input image to be processed. + grid_pinpoints (List): + A string representation of a list of possible resolutions. + size (`tuple`): + Size to resize the original image to. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list[np.ndarray]: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + + resized_original_image = resize( + image, + size=size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + + image_patches = [resized_original_image] + patches + + return image_patches + + def _pad_for_batching( + self, + pixel_values: list[np.ndarray], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches. + + Args: + pixel_values (`list[np.ndarray]`): + An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`) + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + list[`np.ndarray`]: The padded images. + """ + max_patch = max(len(x) for x in pixel_values) + pixel_values = [ + self.pad( + image, + padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)), + data_format=data_format, + input_data_format=input_data_format, + ) + for image in pixel_values + ] + + return pixel_values + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + image_grid_pinpoints: Optional[list] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + image_grid_pinpoints (`List` *optional*, defaults to `self.image_grid_pinpoints`): + A list of possible resolutions to use for processing high resolution images. The best resolution is + selected based on the original size of the image. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + processed_images = [] + image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images] + for image in images: + # convert image into a list of patches + # we intentionally use the same data format as the input data format + image_patches = self.get_image_patches( + image, + image_grid_pinpoints, + size=(size["shortest_edge"], size["shortest_edge"]) + if "shortest_edge" in size + else (min(size["height"], size["width"]), min(size["height"], size["width"])), + patch_size=crop_size["height"], + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # preprocess patches + pixel_values = self._preprocess( + image_patches, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + pixel_values = np.array(pixel_values) + processed_images.append(pixel_values) + + if do_pad: + processed_images = self._pad_for_batching(processed_images) + + return BatchFeature( + data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors + ) + + +__all__ = ["LlavaNextImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..df20e2b90e8323ff17ed2c80d6d5369aba85b428 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for LLaVa-NeXT.""" + +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + divide_to_patches, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + get_image_size, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) + + +class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: Optional[list[list[int]]] + + +@auto_docstring +class LlavaNextImageProcessorFast(BaseImageProcessorFast): + # To be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + do_pad = True + image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + valid_kwargs = LlavaNextFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def _resize_for_patching( + self, + image: "torch.Tensor", + target_resolution: tuple, + interpolation: "F.InterpolationMode", + input_data_format: ChannelDimension, + ) -> "torch.Tensor": + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image ("torch.Tensor"): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + interpolation (`InterpolationMode`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + "torch.Tensor": The resized and padded image. + """ + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = self.resize( + image=image, + size=SizeDict(height=new_height, width=new_width), + interpolation=interpolation, + ) + + return resized_image + + def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple): + original_height, original_width = original_resolution + target_height, target_width = target_resolution + paste_x, r_x = divmod(target_width - original_width, 2) + paste_y, r_y = divmod(target_height - original_height, 2) + return [paste_x, paste_y, paste_x + r_x, paste_y + r_y] + + def _pad_for_patching( + self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension + ) -> "torch.Tensor": + """ + Pad an image to a target resolution while maintaining aspect ratio. + """ + new_resolution = get_patch_output_size(image, target_resolution, input_data_format) + padding = self._get_padding_size(new_resolution, target_resolution) + + padded_image = F.pad(image, padding=padding) + + return padded_image + + def _get_image_patches( + self, + image: "torch.Tensor", + grid_pinpoints, + size: tuple, + patch_size: int, + interpolation: "F.InterpolationMode", + ) -> list["torch.Tensor"]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image ("torch.Tensor"): + The input image to be processed. + grid_pinpoints (List): + A string representation of a list of possible resolutions. + size (`tuple`): + Size to resize the original image to. + patch_size (`int`): + Size of the patches to divide the image into. + interpolation (`"InterpolationMode"`): + Resampling filter to use if resizing the image. + + Returns: + list["torch.Tensor"]: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=ChannelDimension.FIRST) + patches = divide_to_patches(padded_image, patch_size=patch_size) + resized_original_image = F.resize(image, size=size, interpolation=interpolation) + + image_patches = [resized_original_image] + patches + + return image_patches + + def _pad_for_batching( + self, + pixel_values: list["torch.Tensor"], + ) -> list["torch.Tensor"]: + """ + Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches. + + Args: + pixel_values (`list[torch.Tensor]`): + An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`) + + Returns: + list[`torch.Tensor`]: The padded images. + """ + max_patch = max(len(x) for x in pixel_values) + pixel_values = [ + torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]]) + for image in pixel_values + ] + + return pixel_values + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + image_grid_pinpoints: list[list[int]], + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_pad: bool, + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + processed_images = [] + image_sizes = [] + # Determine the size tuple + if size and size.height and size.width: + size_tuple = (size.height, size.width) + else: + size_tuple = (size.shortest_edge, size.shortest_edge) + + # Determine the patch size + if crop_size and crop_size.height: + patch_size = crop_size.height + elif size and size.height: + patch_size = size.height + else: + patch_size = size.shortest_edge + + for image in images: + image_patches = self._get_image_patches( + image, + image_grid_pinpoints, + size=size_tuple, + patch_size=patch_size, + interpolation=interpolation, + ) + + # Group images by size for batched processing + processed_image_patches_grouped = {} + grouped_image_patches, grouped_image_patches_index = group_images_by_shape( + image_patches, disable_grouping=disable_grouping + ) + for shape, stacked_image_patches in grouped_image_patches.items(): + if do_resize: + stacked_image_patches = self.resize( + image=stacked_image_patches, + size=size, + interpolation=interpolation, + ) + if do_center_crop: + stacked_image_patches = self.center_crop(stacked_image_patches, crop_size) + # Fused rescale and normalize + stacked_image_patches = self.rescale_and_normalize( + stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_image_patches_grouped[shape] = stacked_image_patches + processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index) + processed_image_patches = ( + torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches + ) + processed_images.append(processed_image_patches) + image_sizes.append(get_image_size(image, ChannelDimension.FIRST)) + + if do_pad: + processed_images = self._pad_for_batching(processed_images) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature( + data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors + ) + + +__all__ = ["LlavaNextImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..a75b4b7981078cd690ae7d063e5715dce8bf6696 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py @@ -0,0 +1,793 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Llava-NeXT model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...image_processing_utils import select_best_resolution +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ..auto import AutoModel +from .configuration_llava_next import LlavaNextConfig + + +logger = logging.get_logger(__name__) + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (`tuple`): + The size of the input image in the format (width, height). + grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError( + f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" + ) + image_size = image_size.tolist() + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): + """ + Calculate the number of patches after the preprocessing for images of any resolution. + + Args: + image_size (`torch.LongTensor` or `np.ndarray` or `tuple[int, int]`): + The size of the input image in the format (height, width). ? + grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + int: the number of patches + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") + image_size = image_size.tolist() + + best_resolution = select_best_resolution(image_size, grid_pinpoints) + height, width = best_resolution + num_patches = 0 + # consider change to ceil(height/patch_size)*ceil(width/patch_size) + 1 + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + num_patches += 1 + # add the base patch + num_patches += 1 + return num_patches + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (`torch.Tensor`): + The image tensor, assumed to be of shape (num_channels, height, width). + original_size (`tuple`): + The original size of the image (height, width). + + Returns: + `torch.Tensor`: The unpadded image tensor. + """ + if not isinstance(original_size, (list, tuple)): + if not isinstance(original_size, (torch.Tensor, np.ndarray)): + raise TypeError( + f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor" + ) + original_size = original_size.tolist() + original_height, original_width = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(round(original_height * scale_factor, 7)) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding : current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(round(original_width * scale_factor, 7)) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding : current_width - padding] + + return unpadded_tensor + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava outputs, with hidden states and attentions. + """ +) +class LlavaNextModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for LlavaNext causal language model (or autoregressive) outputs. + """ +) +class LlavaNextCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext +class LlavaNextMultiModalProjector(nn.Module): + def __init__(self, config: LlavaNextConfig): + super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + ) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +@auto_docstring +class LlavaNextPreTrainedModel(PreTrainedModel): + config: LlavaNextConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_flex_attn = True + _supports_attention_backend = True + + def _init_weights(self, module): + std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, LlavaNextModel): + embed_std = 1 / math.sqrt(self.config.text_config.hidden_size) + module.image_newline.data.normal_(mean=0.0, std=embed_std) + + +@auto_docstring( + custom_intro=""" + The Llava-Next model which consists of a vision backbone and a language model without language modeling head. + """ +) +class LlavaNextModel(LlavaNextPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + + def __init__(self, config: LlavaNextConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = LlavaNextMultiModalProjector(config) + embed_std = 1 / math.sqrt(config.text_config.hidden_size) + self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std) + + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModel.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None): + """ + Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors. + + Args: + image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`) + List of image feature tensor, each contains all the visual feature of all patches. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_select_strategy (`str`) + The feature selection strategy used to select the vision feature from the vision backbone. + image_newline (`torch.Tensor` of shape `(embed_dim)`) + New line embedding vector. + Returns: + image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`) + feature_lens (`list[int]`) + token length of each image in image_features + """ + new_image_features = [] + feature_lens = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size + + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + + if ( + np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0 + and vision_feature_select_strategy == "default" + ): + logger.warning_once( + "Image feature shape does not line up with the provided patch size. " + "You may be using the `default` vision_feature_select_strategy with a" + " visual encoder that does not have CLS." + ) + + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + if image_newline is not None: + image_feature = torch.cat( + ( + image_feature, + image_newline[:, None, None] + .expand(*image_feature.shape[:-1], 1) + .to(image_feature.device, image_feature.dtype), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if image_newline is not None: + image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0) + new_image_features.append(image_feature) + feature_lens.append(image_feature.size(0)) + feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device) + return new_image_features, feature_lens + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`Union[int, list[int]]`, *optional*): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (list[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + image_features, feature_lens = self.pack_image_features( + image_features, + image_sizes, + vision_feature_select_strategy=vision_feature_select_strategy, + image_newline=self.image_newline, + ) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + image_sizes: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, LlavaNextModelOutputWithPast]: + r""" + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None and pixel_values.size(0) > 0: + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + return LlavaNextModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The LLAVA-NeXT model which consists of a vision backbone and a language model. + """ +) +class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^image_newline": "model.image_newline", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: LlavaNextConfig): + super().__init__(config) + self.model = LlavaNextModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None): + return self.model.pack_image_features( + image_features=image_features, + image_sizes=image_sizes, + vision_feature_select_strategy=vision_feature_select_strategy, + image_newline=image_newline, + ) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + image_sizes=image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + image_sizes: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, LlavaNextCausalLMOutputWithPast]: + r""" + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, LlavaNextForConditionalGeneration + + >>> model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + + >>> prompt = "[INST] \nWhat is shown in this image? [/INST]" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_length=30) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot (...)" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids, + pixel_values=pixel_values, + image_sizes=image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return LlavaNextCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + model_inputs["image_sizes"] = image_sizes + + return model_inputs + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +__all__ = ["LlavaNextForConditionalGeneration", "LlavaNextPreTrainedModel", "LlavaNextModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..2574fc443519f928a1d8f14bed08ae15466950c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py @@ -0,0 +1,266 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for LLaVa-NeXT. +""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import select_best_resolution +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...processing_utils import ( + MultiModalData, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class LlavaNextProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + "return_mm_token_type_ids": False, + }, + "images_kwargs": { + "do_pad": True, + }, + } + + +class LlavaNextProcessor(ProcessorMixin): + r""" + Constructs a LLaVa-NeXT processor which wraps a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor. + + [`LlavaNextProcessor`] offers all the functionalities of [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information. + + Args: + image_processor ([`LlavaNextImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*): + Patch size from the vision tower. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Should be same as in model's config + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + image_token (`str`, *optional*, defaults to `""`): + Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size=None, + vision_feature_select_strategy=None, + chat_template=None, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, + **kwargs, + ): + self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens + self.vision_feature_select_strategy = vision_feature_select_strategy + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[LlavaNextProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if images is None and text is None: + raise ValueError("You have to specify at least images or text.") + + output_kwargs = self._merge_kwargs( + LlavaNextProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise TypeError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = text + if image_inputs: + image_sizes = iter(image_inputs["image_sizes"]) + height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0])) + prompt_strings = [] + for sample in text: + while self.image_token in sample: + image_size = next(image_sizes) + if not isinstance(image_size, (list, tuple)): + # cast to list to avoid numerical precision errors when calculating unpadding + image_size = image_size.tolist() + orig_height, orig_width = image_size + num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + sample = sample.replace(self.image_token, "" * num_image_tokens, 1) + prompt_strings.append(sample) + prompt_strings = [sample.replace("", self.image_token) for sample in prompt_strings] + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int: + image_grid_pinpoints = self.image_processor.image_grid_pinpoints + + height_best_resolution, width_best_resolution = select_best_resolution( + [orig_height, orig_width], image_grid_pinpoints + ) + scale_height, scale_width = height_best_resolution // height, width_best_resolution // width + + patches_height = height // self.patch_size + patches_width = width // self.patch_size + unpadded_features, newline_features = self._get_unpadded_features( + orig_height, orig_width, patches_height, patches_width, scale_height, scale_width + ) + # The base patch covers the entire image (+1 for the CLS) + base_features = patches_height * patches_width + self.num_additional_image_tokens + num_image_tokens = unpadded_features + newline_features + base_features + return num_image_tokens + + def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width): + """ + Get number of features for a given image with height/width. LLaVA-NeXT is different from LLaVA + because it divided each image into patches depending on its resolution. Therefore we need to calculate how many + patches an image is divided into and get the number of features from that. + """ + current_height = patches_height * scale_height + current_width = patches_width * scale_width + + original_aspect_ratio = width / height + current_aspect_ratio = current_width / current_height + if original_aspect_ratio > current_aspect_ratio: + new_height = int(round(height * (current_width / width), 7)) + padding = (current_height - new_height) // 2 + current_height -= padding * 2 + else: + new_width = int(round(width * (current_height / height), 7)) + padding = (current_width - new_width) // 2 + current_width -= padding * 2 + + unpadded_features = current_height * current_width + newline_features = current_height + return (unpadded_features, newline_features) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (list[list[str]], *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + vision_data = {} + if image_sizes is not None: + images_kwargs = LlavaNextProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + + size = images_kwargs.get("size", None) or self.image_processor.size + size = ( + (size["shortest_edge"], size["shortest_edge"]) + if "shortest_edge" in size + else (min(size["height"], size["width"]), min(size["height"], size["width"])) + ) + processed_height, processed_width = size + + batch_num_image_tokens = [] + num_image_patches = [1] * len(image_sizes) # llava-next doesn't batch pixels as Idefics, thus `1` patch` + for image_size in image_sizes: + orig_height, orig_width = image_size + num_image_tokens = self._get_number_of_features( + orig_height, orig_width, processed_height, processed_width + ) + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + batch_num_image_tokens.append(num_image_tokens) + vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + +__all__ = ["LlavaNextProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9dfbb212df4f3cc9528f7fb22d0b19a17b74efe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72be050034b1eea21d2c90fab4440475c69a8789 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f39b198640566d3e91bd73b61d6eb95c896357e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..317404c2ed0b0dea6200e61404222aa84155ad15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d5b63be3535a8f880650dde9873ab26430f5c5f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4db0b7f5edc60f499d146b5108d84e1907976d4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c45bdc7715293b882c1f381e4ec47a68449b8780 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py new file mode 100644 index 0000000000000000000000000000000000000000..ea6ab0cfff35097a62cf665da6c37742553083fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py @@ -0,0 +1,2227 @@ +# coding=utf-8 +# Copyright 2022 Google LLC., LongT5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LongT5 model.""" + +import copy +import math +import warnings +from typing import Any, Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + auto_docstring, + is_torch_flex_attn_available, + is_torch_fx_proxy, + is_torchdynamo_compiling, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_longt5 import LongT5Config + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +# TODO: Update before the merge + + +def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor: + """Pad a tensor so that a sequence length will be a multiple of `block_len`""" + pad_len = -x.shape[dim] % block_len + # Handle cases when an empty input sequence is given + if not all(x.shape): + new_shape = list(x.shape) + new_shape[dim] += pad_len + return torch.zeros(new_shape, dtype=x.dtype) + + pad = [(0, 0)] * x.ndim + pad[dim] = (0, pad_len) + pad = sum(pad[::-1], ()) + x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value) + return x + + +def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor: + """Split an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length + is not a multiple of `block_len`, it will be padded first with selected `pad_value`. + """ + # pad tensor to multiple of block_len + if x.shape[dim] % block_len != 0: + x = _pad_to_multiple(x, block_len, dim, pad_value=0) + num_blocks = x.shape[dim] // block_len + output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[(dim + 1) :] + # If 0 is in output_shape, we cannot apply reshape because of incompatibility with ONNX conversion + if 0 in output_shape: + return torch.empty(output_shape, dtype=x.dtype, device=x.device) + return x.reshape(output_shape) + + +def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor: + """Concatenate three consecutive blocks for each input block for local attentiont. + + For more information, see: https://huggingface.co/papers/2112.07916. + """ + num_blocks = x.shape[block_dim] + + pad = [(0, 0)] * x.ndim + pad[block_dim] = (1, 1) + pad = sum(pad[::-1], ()) + # [batch_size, num_blocks, block_len] -> [batch_size, num_blocks + 2, block_len] + x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value) + + blocks_list: list[torch.Tensor] = [] + for i in range(3): + # We use indexing approach here: + # https://numpy.org/doc/stable/user/basics.indexing.html#dealing-with-variable-numbers-of-indices-within-programs + indices = [slice(0, None)] * x.ndim + indices[block_dim] = slice(i, i + num_blocks) + indices = tuple(indices) + blocks_list.append(x[indices]) + # [batch_size, num_blocks, 3 * block_len, ...] + return torch.cat(blocks_list, dim=sequence_dim) + + +def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor: + """Makes 3-blocked relative position ids for local attention.""" + position_ids = torch.arange(3 * block_len, dtype=torch.int32) + center_position_ids = position_ids[block_len:-block_len] + # [block_len, 3 * block_len] + relative_position_ids = position_ids.unsqueeze(0) - center_position_ids.unsqueeze(1) + return relative_position_ids + + +def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor: + """Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.""" + relative_position_ids = _make_3block_relative_position_ids(block_len) + locality_mask = torch.abs(relative_position_ids) < block_len + locality_mask = locality_mask[None, None, :, :] + locality_mask = locality_mask.to(local_attention_mask.device) + return torch.logical_and(local_attention_mask, locality_mask) + + +def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor: + """Prepare attention mask to be applied for a local attention.""" + # [batch_size, num_blocks, block_len] + _blocked_attention_mask = _split_into_blocks(attention_mask, block_len, dim=1) + # [batch_size, num_block, 3 * block_len] + _3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_dim=1, sequence_dim=2) + + _blocked_attention_mask = _blocked_attention_mask.unsqueeze(-1) + _3blocked_attention_mask = _3blocked_attention_mask.unsqueeze(-2) + # [batch_size, num_block, block_len, 3 * block_len] + local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask) + local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len) + # [batch_size, 1, num_block, block_len, 3 * block_len] + return local_attention_mask.unsqueeze(1).to(device) + + +def _make_global_fixed_block_ids( + attention_mask: torch.Tensor, global_block_size: int +) -> tuple[torch.Tensor, torch.Tensor]: + """Obtain the "fixed block" global id corresponding to each input token. + + This implementation is a simplified version of the original Flaxformr implementation adopted from: + https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py. + + In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for + the whole fixed block, are assigned to the preceding block. + + Padding tokens from the original sequence are represented by -1. + """ + batch_size, seq_len = attention_mask.shape[:2] + + def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor: + block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1 + block_ends = block_ends.to(block_ids.device) + true_block_ends = torch.logical_and(block_ends, block_ids >= 0) + full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1 + block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks) + return block_ids + + fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size + fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask + mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype) + global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype) + _global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device) + global_block_ids = torch.where( + global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound + ) + # set padding tokens to -1 + global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1) + # [batch_size, seq_len] + global_block_ids = handle_orphan_tokens(global_block_ids) + num_globals = seq_len // global_block_size + # [batch_size, seq_len // global_block_size] + if num_globals > 0: + _sequence_block_ids_max = torch.max(global_block_ids, dim=-1).values.repeat(num_globals, 1).transpose(0, 1) + else: + _sequence_block_ids_max = torch.zeros( + batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device + ) + global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1 + global_segment_ids = global_segment_ids.to(attention_mask.device) + global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0) + return global_block_ids.type(torch.int), global_segment_ids.type(torch.int) + + +def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor: + """Create the relative position tensor for local -> global attention.""" + block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size) + global_seq_len = global_segment_ids.shape[-1] + global_positions = torch.arange(global_seq_len, device=block_ids.device) + side_relative_position = global_positions - block_ids[..., None] + return side_relative_position.type(torch.int64) + + +def _create_global_aggregates( + hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int +) -> torch.Tensor: + """Compute individual block aggregates by summing over individual blocks.""" + # (batch..., seq_len, global_seq_len)) + block_ids = block_ids.where( + block_ids >= 0, torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device) + ) + one_hot_block_ids = nn.functional.one_hot(block_ids.type(torch.int64), global_seq_len + 1)[:, :, :-1] + return torch.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids.type(hidden_states.dtype)) + + +# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->LongT5 +class LongT5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://huggingface.co/papers/1910.07467 thus variance is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +try: + from apex.normalization import FusedRMSNorm + + LongT5LayerNorm = FusedRMSNorm + + logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm") +except ImportError: + # using the normal LongT5LayerNorm + pass +except Exception: + logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm") + pass + + +# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5 +class LongT5DenseActDense(nn.Module): + def __init__(self, config: LongT5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class LongT5DenseGatedActDense(nn.Module): + def __init__(self, config: LongT5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->LongT5 +class LongT5LayerFF(nn.Module): + def __init__(self, config: LongT5Config): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = LongT5DenseGatedActDense(config) + else: + self.DenseReluDense = LongT5DenseActDense(config) + + self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +# Copied from transformers.models.t5.modeling_t5.T5Attention with T5->LongT5 +class LongT5Attention(nn.Module): + def __init__( + self, + config: LongT5Config, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None, cache_position=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_values=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) + batch_size, seq_length = hidden_states.shape[:2] + + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None + + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + # Check is encoder-decoder model is being used. Otherwise we'll get `DynamicCache` + is_updated = False + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) + + if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] + + if mask is not None: + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + scores += position_bias_masked + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) + attn_output = self.o(attn_output) + + outputs = (attn_output, position_bias) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class LongT5LocalAttention(nn.Module): + def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None: + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.local_radius = config.local_radius + self.block_len = self.local_radius + 1 + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + # Copied from transformers.models.t5.modeling_t5.T5Attention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, block_length: int): + """Compute binned relative position bias""" + target_device = ( + self.relative_attention_bias.weight.device + if self.relative_attention_bias.weight.device.type != "meta" + else None + ) + memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device) + context_position = memory_position[block_length:-block_length] + + # (block_length, 3 * block_length) + relative_position = memory_position[None, :] - context_position[:, None] + relative_position_bucket = self._relative_position_bucket( + relative_position, # (block_length, 3 * block_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + # (block_length, 3 * block_length, num_heads) + values = self.relative_attention_bias(relative_position_bucket) + # (1, 1, num_heads, block_length, 3 * block_length) + values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0) + return values + + def forward( + self, + hidden_states, + mask=None, + position_bias=None, + layer_head_mask=None, + output_attentions=False, + ): + batch_size, seq_length = hidden_states.shape[:2] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim) + + def unshape(states): + """reshape""" + return states.contiguous().view(batch_size, -1, self.inner_dim) + + # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head) + query_states = shape(self.q(hidden_states)) + key_states = shape(self.k(hidden_states)) + value_states = shape(self.v(hidden_states)) + + # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head) + query_states = _split_into_blocks(query_states, self.block_len, dim=1) + key_states = _split_into_blocks(key_states, self.block_len, dim=1) + value_states = _split_into_blocks(value_states, self.block_len, dim=1) + + # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head) + key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2) + value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2) + + # Compute scores + scores = torch.einsum( + "...qhd,...khd->...hqk", query_states, key_states + ) # (batch_size, num_block, n_heads, block_len, 3 * block_len) + + if position_bias is None: + # position_bias shape: # (1, 1, n_heads, block_len, 3 * block_len) + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, 1, self.n_heads, self.block_len, 3 * self.block_len), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(self.block_len) + + if mask is not None: + # Replace masked positions with -1e10 (according to the original implementation) + mask = torch.where(mask > 0, 0.0, -1e10) + # We need to adjust position bias shape to be sum with mask + position_bias = position_bias + mask.transpose(1, 2) + + scores += position_bias + # (batch_size, num_blocks, n_heads, block_len, 3 * block_len) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + # (batch_size, num_blocks, n_heads, block_len, 3 * block_len) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + attn_weights = attn_weights.type(value_states.dtype) + attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states)) + attn_output = attn_output[:, :seq_length, :] + attn_output = self.o(attn_output) + + outputs = ( + attn_output, + position_bias, + ) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class LongT5TransientGlobalAttention(nn.Module): + def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None: + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.local_radius = config.local_radius + self.block_len = self.local_radius + 1 + self.global_block_size = config.global_block_size + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + + # Relativen attention bias & Layer norm for global attention + if self.has_relative_attention_bias: + self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + + # Copied from transformers.models.t5.modeling_t5.T5Attention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, block_length: int): + """Compute binned relative position bias""" + target_device = ( + self.relative_attention_bias.weight.device + if self.relative_attention_bias.weight.device.type != "meta" + else None + ) + memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device) + context_position = memory_position[block_length:-block_length] + + # (block_length, 3 * block_length) + relative_position = memory_position[None, :] - context_position[:, None] + relative_position_bucket = self._relative_position_bucket( + relative_position, # (block_length, 3 * block_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + # (block_length, 3 * block_length, num_heads) + values = self.relative_attention_bias(relative_position_bucket) + # (1, 1, num_heads, block_length, 3 * block_length) + values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0) + return values + + def compute_side_bias(self, mask: torch.Tensor, global_segment_ids: torch.Tensor) -> torch.Tensor: + # (batch_size, 1, seq_len, global_seq_len) + side_attention_mask = torch.eq(mask[..., None], global_segment_ids[:, None, :])[:, None, ...] + attention_side_bias = torch.where(side_attention_mask > 0, 0.0, -1e10) + # (batch_size, seq_len, global_seq_len) + side_relative_position = _make_side_relative_position_ids(mask, self.global_block_size) + side_relative_position_bucket = self._relative_position_bucket( + side_relative_position, + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + # (batch_size, seq_len, global_seq_len, num_heads) + side_bias = self.global_relative_attention_bias(side_relative_position_bucket) + + # (batch_size, num_heads, seq_len, global_seq_len) + side_bias = side_bias.permute([0, 3, 1, 2]) + # (batch_size, num_heads, seq_len, global_seq_len) + attention_side_bias = attention_side_bias + side_bias + return attention_side_bias + + def forward( + self, + hidden_states, + mask=None, + position_bias=None, + layer_head_mask=None, + output_attentions=False, + ): + batch_size, seq_length = hidden_states.shape[:2] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim) + + def unshape(states): + """reshape""" + return states.contiguous().view(batch_size, -1, self.inner_dim) + + # Prepare components for transient-global attention + # Obtain block_ids and global_segment_ids + # global_seq_len := seq_len // self.global_block_size + # shapes: (batch_size, seq_len) & (batch_size, global_seq_len) + block_ids, global_segment_ids = _make_global_fixed_block_ids( + mask if mask is not None else torch.ones(hidden_states.shape[:-1]), + self.global_block_size, + ) + # Create global inputs + _global_seq_len = global_segment_ids.shape[-1] + global_inputs = _create_global_aggregates(hidden_states, block_ids, _global_seq_len) + global_inputs = self.global_input_layer_norm(global_inputs) + + # get query states -> (batch_size, seq_length, n_heads, dim_per_head) + query_states = shape(self.q(hidden_states)) + key_states = shape(self.k(hidden_states)) + value_states = shape(self.v(hidden_states)) + # Get global/side key/value states shape: (batch_size, global_seq_len, n_heads, dim_per_head) + side_key_states = shape(self.k(global_inputs)) + side_value_states = shape(self.v(global_inputs)) + + # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head) + query_states = _split_into_blocks(query_states, self.block_len, dim=1) + key_states = _split_into_blocks(key_states, self.block_len, dim=1) + value_states = _split_into_blocks(value_states, self.block_len, dim=1) + + # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head) + key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2) + value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2) + + # Tile side inputs across local key/value blocks + # New shape: (batch_size, num_blocks, global_seq_len, n_heads, dim_per_head) + reps = [1] * (side_key_states.ndim + 1) + reps[1] = key_states.shape[1] + side_key_states = side_key_states.unsqueeze(1).repeat(reps) + side_value_states = side_value_states.unsqueeze(1).repeat(reps) + + # Concatenate "local" and "side"/"global" key/value states to allow each token to attend global aggregated ones + # New shape: (batch_size, num_blocks, 3 * block_len + global_seq_len, n_heads, dim_per_head) + key_states = torch.cat([key_states, side_key_states], dim=2) + value_states = torch.cat([value_states, side_value_states], dim=2) + + # Compute scores -> (batch_size, num_block, n_heads, block_len, 3 * block_len + global_seq_len) + scores = torch.einsum("...qhd,...khd->...hqk", query_states, key_states) + + if mask is not None: + # We need to adjust position bias shape to be sum with mask + local_attention_mask = _get_local_attention_mask(mask, self.block_len, hidden_states.device) + # Replace masked positions with -10_000 (according to the original implementation) + local_attention_mask = torch.where(local_attention_mask > 0, 0.0, -1e10) + else: + local_attention_mask = None + + if position_bias is None: + # position_bias shape: # (1, 1, n_heads, block_len, 3 * block_len) + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, 1, self.n_heads, self.block_len, 3 * self.block_len), + device=scores.device, + dtype=scores.dtype, + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(self.block_len) + + if local_attention_mask is not None: + # (batch_size, 1, n_heads, block_len, 3 * block_len) + position_bias = position_bias + local_attention_mask.transpose(1, 2) + position_bias = position_bias.type(scores.dtype) + + # Calculate global/side bias - shape: # (batch_size, num_heads, seq_len, global_seq_len) + if mask is None: + mask = torch.ones(batch_size, seq_length) + # (batch_size, num_heads, seq_len, global_seq_len) + side_position_bias = self.compute_side_bias(mask, global_segment_ids) + # (batch_size, num_blocks, num_heads, block_len, global_seq_len) + side_position_bias = _split_into_blocks(side_position_bias, self.block_len, dim=-2).transpose(1, 2) + side_position_bias = side_position_bias.type(scores.dtype).to(scores.device) + # (batch_size, num_blocks, num_heads, block_len, 3 * block_len + global_seq_len) + position_bias = torch.cat([position_bias, side_position_bias], dim=-1) + + scores += position_bias + # (batch_size, num_blocks, n_heads, block_len, 3 * block_len + global_seq_len) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + attn_weights = attn_weights.type(value_states.dtype) + attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states)) + attn_output = attn_output[:, :seq_length, :] + attn_output = self.o(attn_output) + + outputs = (attn_output, position_bias) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->LongT5 +class LongT5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): + super().__init__() + self.SelfAttention = LongT5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) + self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_values=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class LongT5LayerLocalSelfAttention(nn.Module): + """Local self attention used in encoder""" + + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): + super().__init__() + self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + output_attentions=False, + **kwargs: Any, # to accept past_key_values and use_cache kwargs + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.LocalSelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class LongT5LayerTransientGlobalSelfAttention(nn.Module): + """Transient-Global self attention used in encoder""" + + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): + super().__init__() + self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention( + config, has_relative_attention_bias=has_relative_attention_bias + ) + self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + output_attentions=False, + **kwargs: Any, # to accept past_key_values and use_cache kwargs + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.TransientGlobalSelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->LongT5 +class LongT5LayerCrossAttention(nn.Module): + def __init__(self, config, layer_idx: Optional[int] = None): + super().__init__() + self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx) + self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_values=None, + use_cache=False, + query_length=None, + output_attentions=False, + cache_position=None, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + cache_position=cache_position, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class LongT5Block(GradientCheckpointingLayer): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): + super().__init__() + self.is_decoder = config.is_decoder + if config.is_decoder: + attention_layer = LongT5LayerSelfAttention + elif config.encoder_attention_type == "local": + attention_layer = LongT5LayerLocalSelfAttention + elif config.encoder_attention_type == "transient-global": + attention_layer = LongT5LayerTransientGlobalSelfAttention + else: + raise ValueError( + "For encoder attention mechanism, either `local` or `transient-global` attention type is expected, " + f"but got {config.encoder_attention_type}." + ) + self.layer = nn.ModuleList() + self.layer.append( + attention_layer(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx) + ) + if self.is_decoder: + self.layer.append(LongT5LayerCrossAttention(config, layer_idx=layer_idx)) + + self.layer.append(LongT5LayerFF(config)) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_values=None, + use_cache=False, + output_attentions=False, + return_dict=True, + cache_position=None, + ): + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = self_attention_outputs[0] + attention_outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/ + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + query_length=cache_position[-1] + 1, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/ + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[1:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/ + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + return ( + (hidden_states,) + attention_outputs + ) # hidden-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +@auto_docstring +class LongT5PreTrainedModel(PreTrainedModel): + config: LongT5Config + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["LongT5Block"] + + _can_compile_fullgraph = False # TODO: @raushan more involved due to local/global attn + + @property + # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel.dummy_inputs + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _try_load_missing_tied_module(self, key): + module = self + key = key.removesuffix(".weight") + for sub_key in key.split("."): + if not hasattr(module, sub_key): + return + module = getattr(module, sub_key) + + self._tie_or_clone_weights(module, self.shared) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requested_loading_info = kwargs.get("output_loading_info", False) + kwargs["output_loading_info"] = True + model, loading_info = super().from_pretrained(*args, **kwargs) + missing_keys = loading_info.get("missing_keys", []) + + if hasattr(model, "shared") and hasattr(model, "_tied_weights_keys"): + for missing_key in missing_keys: + logger.warning( + f"Recovering a missing tied weight {missing_key} from a legacy LongT5 checkpoint. " + f"Consider saving {missing_key} in your checkpoint or updating the config (tie_word_embeddings=true)." + ) + model._try_load_missing_tied_module(missing_key) + + if requested_loading_info: + return model, loading_info + return model + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, LongT5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, (LongT5Model, LongT5ForConditionalGeneration, LongT5EncoderModel)): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, LongT5DenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, LongT5DenseGatedActDense): + module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, (LongT5Attention, LongT5LocalAttention, LongT5TransientGlobalAttention)): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + if isinstance(module, LongT5TransientGlobalAttention): + module.global_relative_attention_bias.weight.data.normal_( + mean=0.0, std=factor * ((d_model) ** -0.5) + ) + + # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right with T5->LongT5 + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + if decoder_start_token_id is None: + raise ValueError( + "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. " + "See LongT5 docs for more information." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class LongT5Stack(LongT5PreTrainedModel): + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + self.is_decoder = config.is_decoder + + self.local_radius = config.local_radius + self.block_len = self.local_radius + 1 + + self.block = nn.ModuleList( + [ + LongT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) + for i in range(config.num_layers) + ] + ) + self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + cache_position=None, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if inputs_embeds is None: + assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + if self.is_decoder: + if use_cache and past_key_values is None: + if self.config.is_encoder_decoder: + past_key_values = EncoderDecoderCache( + DynamicCache(config=self.config), DynamicCache(config=self.config) + ) + else: + past_key_values = DynamicCache(config=self.config) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + if self.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache + if isinstance(past_key_values, EncoderDecoderCache) + else past_key_values, + output_attentions, + ) + # We use local attention in encoder self-attention, otherwise standard self & cross attentions are used + elif self.config.encoder_attention_type == "local": + causal_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device) + else: # we need to use both local attention mask and standard extended mask for transient-global attention + causal_mask = attention_mask + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, layer_module in enumerate(self.block): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + causal_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, # as a positional argument for gradient checkpointing + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, + ) + + # layer_outputs is a tuple with: + # hidden-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + hidden_states = layer_outputs[0] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[1] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[2],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[4],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, +`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. +If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, +num_heads)`. +""" + + +@auto_docstring +class LongT5Model(LongT5PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ + r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: LongT5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.tie_encoder_decoder = False + self.encoder = LongT5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.tie_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = LongT5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so + you should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5 + Training](./longt5#training). + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5 + Training](./longt5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + Example: + + ```python + >>> from transformers import AutoTokenizer, LongT5Model + + >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base") + >>> model = LongT5Model.from_pretrained("google/long-t5-local-base") + + >>> # Let's try a very long encoder input. + >>> input_ids = tokenizer( + ... 100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt" + ... ).input_ids # Batch size 1 + + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + LONGT5 Model with a `language modeling` head on top. + """ +) +class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_unexpected = [ + r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + ] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: LongT5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.tie_encoder_decoder = False + self.encoder = LongT5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.tie_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = LongT5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so + you should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5 + Training](./longt5#training). + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5 + Training](./longt5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., + config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for + labels in `[0, ..., config.vocab_size]` + + Examples: + + ```python + >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps") + >>> model = LongT5ForConditionalGeneration.from_pretrained( + ... "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps" + ... ) + + >>> # Let's try a very long input. + >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt") + >>> input_ids = inputs.input_ids + + >>> outputs = model.generate(input_ids) + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + abstractthe aim of this article is to provide an overview of the literature on the role of dog + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + sequence_output = decoder_outputs[0] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + + labels = labels.to(lm_logits.device) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + +@auto_docstring +class LongT5EncoderModel(LongT5PreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight"] + _keys_to_ignore_on_load_unexpected = [r"decoder"] + + def __init__(self, config: LongT5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.tie_encoder_decoder = False + self.encoder = LongT5Stack(encoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so + you should be able to pad the inputs on both the right and the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for detail. + + To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5 + Training](./longt5#training). + + Example: + + ```python + >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base") + >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base") + >>> input_ids = tokenizer( + ... 100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt" + ... ).input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs + + +__all__ = ["LongT5EncoderModel", "LongT5ForConditionalGeneration", "LongT5Model", "LongT5PreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d12684f6a69266ebebbf58dac512d70866e1cf11 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebdd9a128d61691d03d2e10d1375ae242ee9551a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..629dd625d7623b6c89d81e077e1efdc7cf4825c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c456a6b88378a5461e11dadfafb8820698b6b9d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_m2m_100 import * + from .modeling_m2m_100 import * + from .tokenization_m2m_100 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py new file mode 100644 index 0000000000000000000000000000000000000000..620641f1cf4e711bc54de79e02eb90328668c219 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""M2M100 model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import Any, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import TensorType, is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class M2M100Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to instantiate an + M2M100 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the M2M100 + [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the M2M100 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`M2M100Model`] or + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + + Example: + + ```python + >>> from transformers import M2M100Config, M2M100Model + + >>> # Initializing a M2M100 facebook/m2m100_418M style configuration + >>> configuration = M2M100Config() + + >>> # Initializing a model (with random weights) from the facebook/m2m100_418M style configuration + >>> model = M2M100Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "m2m_100" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=128112, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.05, + decoder_layerdrop=0.05, + use_cache=True, + is_encoder_decoder=True, + activation_function="relu", + d_model=1024, + dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + scale_embedding=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + +class M2M100OnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + return common_inputs + + # Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering + # A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and question + # answering are not supported for M2M100, but this name is preserved to be able to check that the copy matches what + # was done for BART so that it can be updated if need be. + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm + + +__all__ = ["M2M100Config", "M2M100OnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py new file mode 100644 index 0000000000000000000000000000000000000000..6015aa54d76b1ef6f7ccb326867ada6c287429a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py @@ -0,0 +1,1445 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch M2M100 model.""" + +import math +from typing import Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import ( + FlashAttentionKwargs, +) +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_m2m_100 import M2M100Config + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->M2M100 +class M2M100ScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +class M2M100SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.make_weights(num_positions + self.offset, embedding_dim, padding_idx) + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) + + self.register_buffer("weights", emb_weights, persistent=False) + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. + + This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of + "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + + return emb.to(torch.get_default_dtype()) + + @torch.no_grad() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values_length: int = 0, + ): + if input_ids is not None: + bsz, seq_len = input_ids.size() + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to( + input_ids.device + ) + else: + bsz, seq_len = inputs_embeds.size()[:-1] + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach() + + def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length + + +# Copied from transformers.models.bart.modeling_bart.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->M2M100 +class M2M100Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[M2M100Config] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + # TODO: we need a refactor so that the different attention modules can get their specific kwargs + # ATM, we have mixed things encoder, decoder, and encoder-decoder attn + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = key_value_states.shape[1] if is_cross_attention else tgt_len + + q_input_shape = (bsz, tgt_len, -1, self.head_dim) + kv_input_shape = (bsz, src_len, -1, self.head_dim) + + # get query proj + query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2) + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(*kv_input_shape).transpose(1, 2) + value_states = value_states.view(*kv_input_shape).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + output_attentions=output_attentions, + head_mask=layer_head_mask, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->M2M100, MBART->M2M100 +class M2M100EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: M2M100Config): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = M2M100Attention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16: + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + return hidden_states, attn_weights + + +# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->M2M100, MBART->M2M100 +class M2M100DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: M2M100Config, layer_idx: Optional[int] = None): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = M2M100Attention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = M2M100Attention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + layer_idx=layer_idx, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_values (`Cache`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class M2M100PreTrainedModel(PreTrainedModel): + config: M2M100Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["M2M100EncoderLayer", "M2M100DecoderLayer"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + # Doesn't support `compile` (dynamic control flow). Can be fixed but low usage model + _can_compile_fullgraph = False + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Optional[Union[torch.Tensor, "BlockMask"]], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + ): + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + # Other attention flavors support in-built causal (when `mask is None`) + # while we need to create our specific block mask regardless + elif attention_mask is None: + attention_mask = make_flex_block_causal_mask( + torch.ones( + size=(input_tensor.shape[0], input_tensor.shape[1]), + device=attention_mask.device, + ) + ) + return attention_mask + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +class M2M100Encoder(M2M100PreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`M2M100EncoderLayer`]. + + Args: + config: M2M100Config + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = M2M100ScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = M2M100SinusoidalPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input_ids, inputs_embeds) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class M2M100Decoder(M2M100PreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`M2M100DecoderLayer`] + + Args: + config: M2M100Config + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = M2M100ScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = M2M100SinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([M2M100DecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." + ) + use_cache = False + + # initialize `past_key_values` + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + batch_size, seq_length = inputs_embeds.size()[:-1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + self_attn_cache = ( + past_key_values.self_attention_cache + if isinstance(past_key_values, EncoderDecoderCache) + else past_key_values + ) + + attention_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + self_attn_cache, + ) + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + input_shape, + inputs_embeds, + ) + # embed positions + positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) + positions = positions.to(inputs_embeds.device) + + hidden_states = inputs_embeds + positions + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + + layer_outputs = decoder_layer( + hidden_states, + attention_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if skip_the_layer: + continue + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring +class M2M100Model(M2M100PreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: M2M100Config): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = M2M100ScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale) + + self.encoder = M2M100Encoder(config, self.shared) + self.decoder = M2M100Decoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + M2M100 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The M2M100 Model with a language modeling head. Can be used for summarization. + """ +) +class M2M100ForConditionalGeneration(M2M100PreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: M2M100Config): + super().__init__(config) + self.model = M2M100Model(config) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + M2M100 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example Translation: + + ```python + >>> from transformers import AutoTokenizer, M2M100ForConditionalGeneration + + >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M") + + >>> text_to_translate = "Life is like a box of chocolates" + >>> model_inputs = tokenizer(text_to_translate, return_tensors="pt") + + >>> # translate to French + >>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr")) + >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + lm_logits = self.lm_head(outputs[0]) + + masked_lm_loss = None + if labels is not None: + # move labels to the correct device to enable PP + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +__all__ = ["M2M100ForConditionalGeneration", "M2M100Model", "M2M100PreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py new file mode 100644 index 0000000000000000000000000000000000000000..ccced10f2bacfb30aa03bf6f30f2e8f9343c64d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py @@ -0,0 +1,384 @@ +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for M2M100.""" + +import json +import os +from pathlib import Path +from shutil import copyfile +from typing import Any, Optional, Union + +import sentencepiece + +from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "spm_file": "sentencepiece.bpe.model", + "tokenizer_config_file": "tokenizer_config.json", +} + + +# fmt: off +FAIRSEQ_LANGUAGE_CODES = { + "m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"], + "wmt21": ['en', 'ha', 'is', 'ja', 'cs', 'ru', 'zh', 'de'] +} +# fmt: on + + +@requires(backends=("sentencepiece",)) +class M2M100Tokenizer(PreTrainedTokenizer): + """ + Construct an M2M100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + spm_file (`str`): + Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that + contains the vocabulary. + src_lang (`str`, *optional*): + A string representing the source language. + tgt_lang (`str`, *optional*): + A string representing the target language. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + language_codes (`str`, *optional*, defaults to `"m2m100"`): + What language codes to use. Should be one of `"m2m100"` or `"wmt21"`. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Examples: + + ```python + >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + + >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro") + >>> src_text = " UN Chief Says There Is No Military Solution in Syria" + >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> outputs = model(**model_inputs) # should work + ```""" + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + prefix_tokens: list[int] = [] + suffix_tokens: list[int] = [] + + def __init__( + self, + vocab_file, + spm_file, + src_lang=None, + tgt_lang=None, + bos_token="", + eos_token="", + sep_token="", + pad_token="", + unk_token="", + language_codes="m2m100", + sp_model_kwargs: Optional[dict[str, Any]] = None, + num_madeup_words=8, + **kwargs, + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.language_codes = language_codes + fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes] + self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code} + + additional_special_tokens = kwargs.pop("additional_special_tokens", []) + for lang_code in fairseq_language_code: + token = self.get_lang_token(lang_code) + if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder: + additional_special_tokens.append(token) + + self.vocab_file = vocab_file + self.encoder = load_json(vocab_file) + self.decoder = {v: k for k, v in self.encoder.items()} + self.spm_file = spm_file + self.sp_model = load_spm(spm_file, self.sp_model_kwargs) + + self.encoder_size = len(self.encoder) + + self.lang_token_to_id = { + self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code) + } + self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)} + self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()} + + self._src_lang = src_lang if src_lang is not None else "en" + self.tgt_lang = tgt_lang + self.cur_lang_id = self.get_lang_id(self._src_lang) + + self.num_madeup_words = num_madeup_words + + super().__init__( + src_lang=src_lang, + tgt_lang=tgt_lang, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + language_codes=language_codes, + sp_model_kwargs=self.sp_model_kwargs, + additional_special_tokens=additional_special_tokens, + num_madeup_words=num_madeup_words, + **kwargs, + ) + self.set_src_lang_special_tokens(self._src_lang) + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self) -> dict: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + @property + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + if token in self.lang_token_to_id: + return self.lang_token_to_id[token] + return self.encoder.get(token, self.encoder[self.unk_token]) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the decoder.""" + if index in self.id_to_lang_token: + return self.id_to_lang_token[index] + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `X [eos, src_lang_code]` + - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def __getstate__(self) -> dict: + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d: dict) -> None: + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + save_dir = Path(save_directory) + if not save_dir.is_dir(): + raise OSError(f"{save_directory} should be a directory") + vocab_save_path = save_dir / ( + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] + ) + spm_save_path = save_dir / ( + (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"] + ) + + save_json(self.encoder, vocab_save_path) + + if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file): + copyfile(self.spm_file, spm_save_path) + elif not os.path.isfile(self.spm_file): + with open(spm_save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (str(vocab_save_path), str(spm_save_path)) + + def prepare_seq2seq_batch( + self, + src_texts: list[str], + src_lang: str = "en", + tgt_texts: Optional[list[str]] = None, + tgt_lang: str = "ro", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + self.set_src_lang_special_tokens(self.src_lang) + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs) + tgt_lang_id = self.get_lang_id(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def _switch_to_input_mode(self): + self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + self.set_tgt_lang_special_tokens(self.tgt_lang) + + def set_src_lang_special_tokens(self, src_lang: str) -> None: + """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" + lang_token = self.get_lang_token(src_lang) + self.cur_lang_id = self.lang_token_to_id[lang_token] + self.prefix_tokens = [self.cur_lang_id] + self.suffix_tokens = [self.eos_token_id] + + def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None: + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + lang_token = self.get_lang_token(tgt_lang) + self.cur_lang_id = self.lang_token_to_id[lang_token] + self.prefix_tokens = [self.cur_lang_id] + self.suffix_tokens = [self.eos_token_id] + + def get_lang_token(self, lang: str) -> str: + return self.lang_code_to_token[lang] + + def get_lang_id(self, lang: str) -> int: + lang_token = self.get_lang_token(lang) + return self.lang_token_to_id[lang_token] + + +def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs) + spm.Load(str(path)) + return spm + + +def load_json(path: str) -> Union[dict, list]: + with open(path, "r") as f: + return json.load(f) + + +def save_json(data, path: str) -> None: + with open(path, "w") as f: + json.dump(data, f, indent=2) + + +__all__ = ["M2M100Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f7cd8c28da631fdd44cc09458380527b6323d044 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_mbart50 import * + from .tokenization_mbart50_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py new file mode 100644 index 0000000000000000000000000000000000000000..413beaa03a83e4eddb9eb5a050d85c347ecbe2b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py @@ -0,0 +1,359 @@ +# coding=utf-8 +# Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"] # fmt: skip + + +@requires(backends=("sentencepiece",)) +class MBart50Tokenizer(PreTrainedTokenizer): + """ + Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + src_lang (`str`, *optional*): + A string representing the source language. + tgt_lang (`str`, *optional*): + A string representing the target language. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Examples: + + ```python + >>> from transformers import MBart50Tokenizer + + >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") + >>> src_text = " UN Chief Says There Is No Military Solution in Syria" + >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> # model(**model_inputs) should work + ```""" + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + prefix_tokens: list[int] = [] + suffix_tokens: list[int] = [] + + def __init__( + self, + vocab_file, + src_lang=None, + tgt_lang=None, + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or [] + kwargs["additional_special_tokens"] += [ + code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"] + ] + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES) + } + self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()} + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + super().__init__( + src_lang=src_lang, + tgt_lang=tgt_lang, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self._src_lang = src_lang if src_lang is not None else "en_XX" + self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + self.tgt_lang = tgt_lang + self.set_src_lang_special_tokens(self._src_lang) + + @property + def vocab_size(self) -> int: + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token + + @property + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + def __getstate__(self) -> dict: + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d: dict) -> None: + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def get_vocab(self) -> dict: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `[src_lang_code] X [eos]` + - `labels`: (for decoder) `[tgt_lang_code] X [eos]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def prepare_seq2seq_batch( + self, + src_texts: list[str], + src_lang: str = "en_XX", + tgt_texts: Optional[list[str]] = None, + tgt_lang: str = "ro_RO", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) + + def set_src_lang_special_tokens(self, src_lang: str) -> None: + """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[src_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None: + """Reset the special tokens to the target language setting. prefix=[tgt_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + +__all__ = ["MBart50Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..985b0929f87c5516a2edd8e02c6aaaa8926d496e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import Optional + +from tokenizers import processors + +from ...tokenization_utils import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_mbart50 import MBart50Tokenizer +else: + MBart50Tokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"] # fmt: skip + + +class MBart50TokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's *tokenizers* library). Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + src_lang (`str`, *optional*): + A string representing the source language. + tgt_lang (`str`, *optional*): + A string representing the target language. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + + Examples: + + ```python + >>> from transformers import MBart50TokenizerFast + + >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") + >>> src_text = " UN Chief Says There Is No Military Solution in Syria" + >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> # model(**model_inputs) should work + ```""" + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = MBart50Tokenizer + + prefix_tokens: list[int] = [] + suffix_tokens: list[int] = [] + + def __init__( + self, + vocab_file=None, + src_lang=None, + tgt_lang=None, + tokenizer_file=None, + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or [] + kwargs["additional_special_tokens"] += [ + code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"] + ] + + super().__init__( + vocab_file, + src_lang=src_lang, + tgt_lang=tgt_lang, + tokenizer_file=tokenizer_file, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + self.lang_code_to_id = { + lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES + } + + self._src_lang = src_lang if src_lang is not None else "en_XX" + self.tgt_lang = tgt_lang + self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + self.set_src_lang_special_tokens(self._src_lang) + + @property + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. The special tokens depend on calling set_lang. + + An MBART-50 sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `[src_lang_code] X [eos]` + - `labels`: (for decoder) `[tgt_lang_code] X [eos]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def prepare_seq2seq_batch( + self, + src_texts: list[str], + src_lang: str = "en_XX", + tgt_texts: Optional[list[str]] = None, + tgt_lang: str = "ro_RO", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) + + def set_src_lang_special_tokens(self, src_lang: str) -> None: + """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang) + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None: + """Reset the special tokens to the target language setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang) + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["MBart50TokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..609613a79e3e6112d362af8bb834f6b550600479 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cb62459b99bb4bd04084bb7a400f4222f5ac884 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44591bfa4af9c576217d322de11bc1f4407028cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed3361fee50df163f3021464871bb74457a4f0d4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9434ee35e80e1a142df1a28dff1ed0b9db8e3a3e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75074f06a243e15c20c13311028c035400749c52 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e113347c1e24257bbb23d95fd8495d8321a108f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19c9a737ea368bdaf5382158390a04630d8d044f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08e86af2d2711e5ce887b68be3684881cfdc3093 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92b950df843d1f09bf3458cf6dc76a598bd309cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a474d618d79e0477d06223b40b135ab1dc8312c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18a5657cd2ec6dbf39c6e794fae09bf65753da8f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mistral import * + from .modeling_flax_mistral import * + from .modeling_mistral import * + from .modeling_tf_mistral import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..e9f66b1a2fbe9117108b949107cd42246a7760b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Mistral model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`. + head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): + The attention head dimension. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `MistralModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + head_dim=None, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.head_dim = head_dim + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + if "layer_types" in kwargs: + logger.warning_once( + "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." + ) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["MistralConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..2c084ee114d762de16d11710cb05ec49d41e3676 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py @@ -0,0 +1,744 @@ +# coding=utf-8 +# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax Mistral model.""" + +from typing import Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPast, + FlaxCausalLMOutput, + FlaxCausalLMOutputWithCrossAttentions, +) +from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward +from .configuration_mistral import MistralConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" +_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" +_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny" + +MISTRAL_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`MistralConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or + `jax.numpy.bfloat16`. + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral +class FlaxMistralRMSNorm(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.epsilon = self.config.rms_norm_eps + self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size) + + def __call__(self, hidden_states): + variance = jnp.asarray(hidden_states, dtype=jnp.float32) + variance = jnp.power(variance, 2) + variance = variance.mean(-1, keepdims=True) + # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt` + hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon) + + return self.weight * jnp.asarray(hidden_states, dtype=self.dtype) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral +class FlaxMistralRotaryEmbedding(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + head_dim = self.config.hidden_size // self.config.num_attention_heads + self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim) + + def __call__(self, key, query, position_ids): + sincos = self.sincos[position_ids] + sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1) + + key = apply_rotary_pos_emb(key, sin_pos, cos_pos) + query = apply_rotary_pos_emb(query, sin_pos, cos_pos) + + key = jnp.asarray(key, dtype=self.dtype) + query = jnp.asarray(query, dtype=self.dtype) + + return key, query + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral +class FlaxMistralMLP(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim + + kernel_init = jax.nn.initializers.normal(self.config.initializer_range) + self.act = ACT2FN[self.config.hidden_act] + + self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + + def __call__(self, hidden_states): + up_proj_states = self.up_proj(hidden_states) + gate_states = self.act(self.gate_proj(hidden_states)) + + hidden_states = self.down_proj(up_proj_states * gate_states) + return hidden_states + + +# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(tensor, sin_pos, cos_pos): + return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos) + + +# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions +def create_sinusoidal_positions(num_pos, dim): + inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim)) + freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32") + + emb = np.concatenate((freqs, freqs), axis=-1) + out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1) + return jnp.array(out[:, :, :num_pos]) + + +# Copied from transformers.models.llama.modeling_flax_llama.rotate_half +def rotate_half(tensor): + """Rotates half the hidden dims of the input.""" + rotate_half_tensor = jnp.concatenate( + (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1 + ) + return rotate_half_tensor + + +class FlaxMistralAttention(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + config = self.config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 + self.rope_theta = config.rope_theta + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype) + causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool") + self.causal_mask = jnp.triu(causal_mask, k=-(config.sliding_window or 0)) + self.rotary_emb = FlaxMistralRotaryEmbedding(self.config, dtype=self.dtype) + + def _split_heads(self, hidden_states, num_heads): + return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) + + @nn.compact + # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + deterministic: bool = True, + output_attentions: bool = False, + init_cache: bool = False, + ) -> tuple[jnp.ndarray, jnp.ndarray]: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states, self.num_heads) + key_states = self._split_heads(key_states, self.num_key_value_heads) + value_states = self._split_heads(value_states, self.num_key_value_heads) + + key_states, query_states = self.rotary_emb(key_states, query_states, position_ids) + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + + batch_size = hidden_states.shape[0] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + + if self.has_variable("cache", "cached_key") or init_cache: + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2) + value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2) + + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + + # usual dot product attention + attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + deterministic=deterministic, + dropout_rate=self.config.attention_dropout, + dtype=attention_dtype, + ) + + if self.attention_softmax_in_fp32: + attn_weights = attn_weights.astype(self.dtype) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.o_proj(attn_output) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral +class FlaxMistralDecoderLayer(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype) + self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + outputs = self.self_attn( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + # residual connection + attn_output = outputs[0] + hidden_states = residual + attn_output + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + hidden_states + + return (hidden_states,) + outputs[1:] + + +# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model +class FlaxMistralPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MistralConfig + base_model_prefix = "model" + module_class: nn.Module = None + + def __init__( + self, + config: MistralConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length)) + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + params: Optional[dict] = None, + past_key_values: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + batch_size, sequence_length = input_ids.shape + + if position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") + + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxMistralAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral +class FlaxMistralLayerCollection(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.blocks = [ + FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i)) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) + layer_outputs = block( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + # this contains possible `None` values - `FlaxMistralModule` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions) + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral +class FlaxMistralModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.hidden_size = self.config.hidden_size + embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range) + self.embed_tokens = nn.Embed( + self.config.vocab_size, + self.hidden_size, + embedding_init=embedding_init, + dtype=self.dtype, + ) + self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype) + self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + input_embeds = self.embed_tokens(input_ids.astype("i4")) + + outputs = self.layers( + input_embeds, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[-1], + ) + + +@add_start_docstrings( + "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class FlaxMistralModel(FlaxMistralPreTrainedModel): + module_class = FlaxMistralModule + + +append_call_sample_docstring( + FlaxMistralModel, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPast, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral +class FlaxMistralForCausalLMModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.model = FlaxMistralModule(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.model( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """ + The Mistral Model transformer with a language modeling head (linear layer) on top. + """, + MISTRAL_START_DOCSTRING, +) + +# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral +class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel): + module_class = FlaxMistralForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since Mistral uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxMistralForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) + +__all__ = ["FlaxMistralForCausalLM", "FlaxMistralModel", "FlaxMistralPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7c7b2c17900641f6b7d540fbec1ee2faf59c33 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py @@ -0,0 +1,480 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/mistral/modular_mistral.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_mistral.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from transformers.utils.generic import check_model_inputs + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForQuestionAnswering, + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from .configuration_mistral import MistralConfig + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class MistralAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: MistralConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), # main diff with Llama + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +@use_kernel_forward_from_hub("RMSNorm") +class MistralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class MistralDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: MistralConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = MistralAttention(config=config, layer_idx=layer_idx) + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class MistralPreTrainedModel(PreTrainedModel): + config: MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": MistralDecoderLayer, + "attentions": MistralAttention, + } + + +class MistralRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: MistralConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class MistralModel(MistralPreTrainedModel): + def __init__(self, config: MistralConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = MistralRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +@auto_docstring +class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("meta-mistral/Mistral-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-mistral/Mistral-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class MistralForTokenClassification(GenericForTokenClassification, MistralPreTrainedModel): + pass + + +class MistralForSequenceClassification(GenericForSequenceClassification, MistralPreTrainedModel): + pass + + +class MistralForQuestionAnswering(GenericForQuestionAnswering, MistralPreTrainedModel): ... + + +__all__ = [ + "MistralForCausalLM", + "MistralForQuestionAnswering", + "MistralModel", + "MistralPreTrainedModel", + "MistralForSequenceClassification", + "MistralForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ca7d13b6a88f27fe1957510da75b84865ae2dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py @@ -0,0 +1,1016 @@ +# coding=utf-8 +# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 Mistral model.""" + +import math +import warnings +from typing import Optional, Union + +import tensorflow as tf + +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPast, + TFCausalLMOutputWithPast, + TFSequenceClassifierOutputWithPast, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_mistral import MistralConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + + +def _make_causal_mask(input_ids_shape, dtype, past_key_values_length=0): + """ + Make causal mask used for bi-directional self-attention, supporting both static and dynamic shapes. + """ + bsz, tgt_len = input_ids_shape + + # Create a matrix where only the lower triangle and diagonal are filled with zeros (causal mask) + mask = tf.fill((tgt_len, tgt_len), tf.dtypes.as_dtype(dtype).min) + mask_cond = tf.range(tgt_len) + mask = tf.where(mask_cond[:, None] >= mask_cond[None, :], 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=dtype), mask], axis=-1) + + if bsz is None: + # When batch size is dynamic, expand and tile + # so we can compile a functional model + mask = tf.expand_dims(mask, 0) + mask = tf.expand_dims(mask, 0) # shape: (1, 1, tgt_len, tgt_len + past_key_values_length) + mask = tf.tile(mask, [bsz, 1, 1, 1]) + else: + # When batch size is static, directly use broadcast_to + mask = tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return mask + + +def _expand_mask(mask, dtype, tgt_len=None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = shape_list(mask) + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = tf.expand_dims(tf.expand_dims(mask, 1), 1) + expanded_mask = tf.broadcast_to(expanded_mask, [bsz, 1, tgt_len, src_len]) + + inverted_mask = 1.0 - tf.cast(expanded_mask, dtype) + + return tf.where( + tf.cast(inverted_mask, bool), tf.fill(dims=shape_list(inverted_mask), value=tf.float32.min), inverted_mask + ) + + +class TFMistralRMSNorm(keras.layers.Layer): + def __init__(self, hidden_size, eps=1e-6, **kwargs): + """ + TFMistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.variance_epsilon = eps + + def build(self, input_shape=None): + self.weight = self.add_weight( + name="weight", + shape=self.hidden_size, + initializer="ones", + ) + if self.built: + return + self.built = True + + def call(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = tf.cast(hidden_states, tf.float32) + variance = tf.reduce_mean(tf.square(hidden_states), axis=-1, keepdims=True) + hidden_states = tf.divide(hidden_states, tf.sqrt(variance + self.variance_epsilon)) + return self.weight * tf.cast(hidden_states, input_dtype) + + +# Verification: https://colab.research.google.com/gist/ariG23498/f8d8131b795a131b93d99e70ee93c192/scratchpad.ipynb +class TFMistralRotaryEmbedding(keras.layers.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.inv_freq = 1.0 / (self.base ** (tf.range(start=0, limit=self.dim, delta=2, dtype=tf.float32) / self.dim)) + + def call(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + t = tf.cast(tf.range(seq_len, dtype=tf.int64), self.inv_freq.dtype) + freqs = tf.einsum("i,j->ij", t, self.inv_freq) + emb = tf.concat([freqs, freqs], axis=-1) + cos_values = tf.cast(tf.cos(emb), x.dtype) + sin_values = tf.cast(tf.sin(emb), x.dtype) + + cos_values = cos_values[:seq_len] + cos_values = tf.cast(cos_values, dtype=x.dtype) + sin_values = sin_values[:seq_len] + sin_values = tf.cast(sin_values, dtype=x.dtype) + return (cos_values, sin_values) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + mid_length = shape_list(x)[-1] // 2 + x1 = x[..., :mid_length] + x2 = x[..., mid_length:] + return tf.concat([-x2, x1], axis=-1) + + +# Verification: https://colab.research.google.com/gist/ariG23498/bb8474baeb33f4ae6ed7d77da5f7e7a4/scratchpad.ipynb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`tf.Tensor`): The query tensor. + k (`tf.Tensor`): The key tensor. + cos (`tf.Tensor`): The cosine part of the rotary embedding. + sin (`tf.Tensor`): The sine part of the rotary embedding. + position_ids (`tf.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(tf.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = tf.expand_dims(tf.gather(cos, position_ids), unsqueeze_dim) + sin = tf.expand_dims(tf.gather(sin, position_ids), unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class TFMistralMLP(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = keras.layers.Dense(self.intermediate_size, use_bias=False, name="gate_proj") + self.up_proj = keras.layers.Dense(self.intermediate_size, use_bias=False, name="up_proj") + self.down_proj = keras.layers.Dense(self.hidden_size, use_bias=False, name="down_proj") + self.act_fn = get_tf_activation(config.hidden_act) + + def call(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "gate_proj", None) is not None: + with tf.name_scope(self.gate_proj.name): + self.gate_proj.build((self.hidden_size,)) + if getattr(self, "up_proj", None) is not None: + with tf.name_scope(self.up_proj.name): + self.up_proj.build((self.hidden_size,)) + if getattr(self, "down_proj", None) is not None: + with tf.name_scope(self.down_proj.name): + self.down_proj.build((self.intermediate_size,)) + + +# Verification: https://colab.research.google.com/gist/ariG23498/556d443d491966763ce2e7eee336efed/scratchpad.ipynb +def repeat_kv(hidden_states: tf.Tensor, n_rep: int) -> tf.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = shape_list(hidden_states) + if n_rep == 1: + return hidden_states + hidden_states = tf.expand_dims(hidden_states, 2) + hidden_states = tf.repeat(hidden_states, repeats=n_rep, axis=2) + return tf.reshape(hidden_states, (batch, num_key_value_heads * n_rep, slen, head_dim)) + + +class TFMistralAttention(keras.layers.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = keras.layers.Dense(self.num_heads * self.head_dim, use_bias=False, name="q_proj") + self.k_proj = keras.layers.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, name="k_proj") + self.v_proj = keras.layers.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, name="v_proj") + self.o_proj = keras.layers.Dense(self.hidden_size, use_bias=False, name="o_proj") + + self.rotary_emb = TFMistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + name="rotary_emb", + ) + self.dropout = keras.layers.Dropout(rate=self.attention_dropout) + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + tensor = tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)) + tensor = tf.transpose(tensor, perm=(0, 2, 1, 3)) + return tensor + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_value: Optional[tuple[tf.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + training=None, + **kwargs, + ) -> tuple[tf.Tensor, Optional[tf.Tensor], Optional[tuple[tf.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = shape_list(hidden_states) + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = tf.transpose( + tf.reshape(query_states, (bsz, q_len, self.num_heads, self.head_dim)), perm=(0, 2, 1, 3) + ) + key_states = tf.transpose( + tf.reshape(key_states, (bsz, q_len, self.num_key_value_heads, self.head_dim)), perm=(0, 2, 1, 3) + ) + value_states = tf.transpose( + tf.reshape(value_states, (bsz, q_len, self.num_key_value_heads, self.head_dim)), perm=(0, 2, 1, 3) + ) + + kv_seq_len = shape_list(key_states)[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb( + x=value_states, + seq_len=kv_seq_len, + ) + query_states, key_states = apply_rotary_pos_emb( + q=query_states, + k=key_states, + cos=cos, + sin=sin, + position_ids=position_ids, + ) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) / math.sqrt(self.head_dim) + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = stable_softmax(attn_weights, axis=-1) + attn_weights = tf.cast(attn_weights, query_states.dtype) + attn_weights = self.dropout( + attn_weights, + training=training, + ) + attn_output = tf.matmul(attn_weights, value_states) + + attn_output = tf.transpose(attn_output, perm=(0, 2, 1, 3)) + attn_output = tf.reshape(attn_output, (bsz, q_len, self.hidden_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build((self.hidden_size,)) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build((self.hidden_size,)) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build((self.hidden_size,)) + if getattr(self, "o_proj", None) is not None: + with tf.name_scope(self.o_proj.name): + self.o_proj.build((self.num_heads * self.head_dim,)) + + +class TFMistralDecoderLayer(keras.layers.Layer): + def __init__(self, config: MistralConfig, layer_idx: int, **kwargs): + super().__init__(**kwargs) + self.hidden_size = config.hidden_size + + self.self_attn = TFMistralAttention(config, layer_idx, name="self_attn") + + self.mlp = TFMistralMLP(config, name="mlp") + self.input_layernorm = TFMistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="input_layernorm") + self.post_attention_layernorm = TFMistralRMSNorm( + config.hidden_size, eps=config.rms_norm_eps, name="post_attention_layernorm" + ) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_value: Optional[tuple[tf.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> tuple[tf.Tensor, Optional[tuple[tf.Tensor, tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(tf.Tensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "input_layernorm", None) is not None: + with tf.name_scope(self.input_layernorm.name): + self.input_layernorm.build(None) + if getattr(self, "post_attention_layernorm", None) is not None: + with tf.name_scope(self.post_attention_layernorm.name): + self.post_attention_layernorm.build(None) + + +@keras_serializable +class TFMistralMainLayer(keras.layers.Layer): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + config_class = MistralConfig + + def __init__(self, config: MistralConfig, **kwargs): + super().__init__(**kwargs) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + + # TF and PT Embedding check: https://colab.research.google.com/gist/ariG23498/2b9826818875c9c4968c79cb19f55f2c/scratchpad.ipynb + self.embed_tokens = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.hidden_size, + name="embed_tokens", + ) + self.layers = [ + TFMistralDecoderLayer(config, layer_idx, name=f"layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) + ] + self._attn_implementation = config._attn_implementation + self.norm = TFMistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="norm") + self.config = config + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + # if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_values: Optional[list[tf.Tensor]] = None, + inputs_embeds: Optional[tf.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TFBaseModelOutputWithPast]: + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = shape_list(input_ids) + elif inputs_embeds is not None: + batch_size, seq_length, _ = shape_list(inputs_embeds) + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = shape_list(past_key_values[0][0])[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + position_ids = tf.range( + start=past_key_values_length, limit=seq_length + past_key_values_length, dtype=tf.int64 + ) + position_ids = tf.reshape(tf.expand_dims(position_ids, 0), (-1, seq_length)) + + else: + position_ids = tf.cast(tf.reshape(position_ids, (-1, seq_length)), tf.int64) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is None: + attention_mask = tf.ones((batch_size, seq_length_with_past), dtype=tf.bool) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_tokens", None) is not None: + with tf.name_scope(self.embed_tokens.name): + self.embed_tokens.build(None) + if getattr(self, "norm", None) is not None: + with tf.name_scope(self.norm.name): + self.norm.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +MISTRAL_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `model` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`MistralConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class TFMistralPreTrainedModel(TFPreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(tf.Tensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + One formats is allowed: + - Tuple of `tuple(tf.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class TFMistralModel(TFMistralPreTrainedModel): + def __init__(self, config: MistralConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFMistralMainLayer(config, name="model") + + @unpack_inputs + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_values: Optional[list[tf.Tensor]] = None, + inputs_embeds: Optional[tf.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TFBaseModelOutputWithPast]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +class TFMistralForCausalLM(TFMistralPreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFMistralMainLayer(config, name="model") + self.vocab_size = config.vocab_size + self.lm_head = keras.layers.Dense( + config.vocab_size, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="lm_head", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_values: Optional[list[tf.Tensor]] = None, + inputs_embeds: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TFCausalLMOutputWithPast]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` + or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = tf.cast(logits, tf.float32) + + loss = None + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels, shifted_logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values: + input_ids = tf.expand_dims(input_ids[:, -1], -1) + + position_ids = kwargs.get("position_ids") + if attention_mask is not None and position_ids is None: + position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True) + if past_key_values: + position_ids = tf.expand_dims(position_ids[:, -1], -1) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + } + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build((self.config.hidden_size,)) + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + MISTRAL_START_DOCSTRING, +) +class TFMistralForSequenceClassification(TFMistralPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.model = TFMistralMainLayer(config, name="model") + self.score = keras.layers.Dense( + self.num_labels, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="score", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + past_key_values: Optional[list[tf.Tensor]] = None, + inputs_embeds: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TFSequenceClassifierOutputWithPast]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + + transformer_outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + logits_shape = shape_list(logits) + batch_size = logits_shape[0] + + if self.config.pad_token_id is None: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + else: + if input_ids is not None: + token_indices = tf.range(shape_list(input_ids)[-1]) + non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype) + last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1) + else: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + loss = None + + pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1) + + if labels is not None: + if self.config.pad_token_id is None and logits_shape[0] != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + + loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels])) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "score", None) is not None: + with tf.name_scope(self.score.name): + self.score.build((self.config.hidden_size,)) + + +__all__ = ["TFMistralModel", "TFMistralForCausalLM", "TFMistralForSequenceClassification", "TFMistralPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py new file mode 100644 index 0000000000000000000000000000000000000000..290d60b91e66ccb1bb9aaf853b521cc56f20e79a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py @@ -0,0 +1,199 @@ +from typing import Callable, Optional + +import torch +from torch import nn + +from transformers.utils.generic import check_model_inputs + +from ...cache_utils import Cache, DynamicCache +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForQuestionAnswering, +) +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaForTokenClassification, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_mistral import MistralConfig + + +logger = logging.get_logger(__name__) + + +class MistralMLP(LlamaMLP): + def __init__(self, config): + super().__init__(config) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + +class MistralAttention(LlamaAttention): + def __init__(self, config: MistralConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), # main diff with Llama + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class MistralDecoderLayer(LlamaDecoderLayer): + def __init__(self, config: MistralConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.self_attn = MistralAttention(config=config, layer_idx=layer_idx) + self.mlp = MistralMLP(config) + + +class MistralPreTrainedModel(LlamaPreTrainedModel): + _can_record_outputs = { + "hidden_states": MistralDecoderLayer, + "attentions": MistralAttention, + } + + +class MistralModel(LlamaModel): + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +class MistralForCausalLM(LlamaForCausalLM): + pass + + +class MistralForTokenClassification(LlamaForTokenClassification): + pass + + +class MistralForSequenceClassification(LlamaForSequenceClassification): + pass + + +class MistralForQuestionAnswering(GenericForQuestionAnswering, MistralPreTrainedModel): ... + + +__all__ = [ + "MistralForCausalLM", + "MistralForQuestionAnswering", + "MistralModel", + "MistralPreTrainedModel", + "MistralForSequenceClassification", + "MistralForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91c0a48b8b8c906c4e1c9cbde3afe4130d233197 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c083c1097e44e4a0608023786da340e0d2bd3f29 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5e57a37895b1d29ca6a9cb19938d3fd13cbf01d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca95a9fe7bf3535f0530a5707c1cf81d448af8bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1efc6527fcbd523aaca30c08c6ee445adc69cb13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b70d697edabea8737834521fed9920f4854c2b8e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bd826b44a117b608db36dac46d1cefdc53eae44 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d498323de4dc3b60dcf8c3c8ac02f25d8c3ed1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py @@ -0,0 +1,2605 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_mm_grounding_dino.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ModelOutput, is_timm_available, requires_backends +from ...integrations import use_kernel_forward_from_hub +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import auto_docstring +from ...utils.backbone_utils import load_backbone +from ..auto.modeling_auto import AutoModel +from .configuration_mm_grounding_dino import MMGroundingDinoConfig + + +if is_timm_available(): + from timm import create_model + + +class MMGroundingDinoContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + self.bias = nn.Parameter(torch.tensor(0.0)) + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: + res = vision_hidden_state @ text_hidden_state.transpose(-1, -2) + res = res / math.sqrt(vision_hidden_state.shape[-1]) + res = res + self.bias + res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res[..., : res.shape[-1]] = res + + return new_res + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: list[tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class MMGroundingDinoLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, config): + super().__init__() + + embedding_dim = config.d_model // 2 + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +class MMGroundingDinoMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: MMGroundingDinoConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in MMGroundingDinoMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + # Ignore copy + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +class MMGroundingDinoBiMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + + vision_dim = text_dim = config.d_model + embed_dim = config.encoder_ffn_dim // 2 + num_heads = config.encoder_attention_heads // 2 + dropout = config.fusion_dropout + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + ) + self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + vision_attention_mask: Optional[torch.BoolTensor] = None, + text_attention_mask: Optional[torch.BoolTensor] = None, + ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image-to-text and text-to-image cross-attention + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + vision_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + text_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention + output and weights: + - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_din)`) + -- + Output of the image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) -- + Output of the text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ + batch_size, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._reshape(vision_query_states, tgt_len, batch_size) + + text_key_states = self.text_proj(text_features) + text_key_states = self._reshape(text_key_states, -1, batch_size) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._reshape(vision_value_states, -1, batch_size) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._reshape(text_value_states, -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + # Do not increase -50000/50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, min=-50000, max=50000) + + attn_weights_transposed = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] + + # Do not increase -50000/50000, data type half has quite limited range + text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000) + + # mask vision for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(batch_size, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(batch_size, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class MMGroundingDinoDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class MMGroundingDinoFusionLayer(nn.Module): + def __init__(self, config): + super().__init__() + drop_path = config.fusion_droppath + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_text = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.attn = MMGroundingDinoBiMultiHeadAttention(config) + + # add layer scale for training stability + self.drop_path = MMGroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + init_values = 1e-4 + self.vision_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True) + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + attention_mask_vision: Optional[torch.BoolTensor] = None, + attention_mask_text: Optional[torch.BoolTensor] = None, + ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image and text features fusion + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + attention_mask_vision (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + attention_mask_text (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced + feature and attention output and weights: + - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) -- + Updated vision features with attention output from image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) -- + Updated text features with attention output from text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, + ) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) + + +@auto_docstring +class MMGroundingDinoPreTrainedModel(PreTrainedModel): + config: MMGroundingDinoConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, MMGroundingDinoLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, MMGroundingDinoMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, MMGroundingDinoBiMultiHeadAttention): + nn.init.xavier_uniform_(module.vision_proj.weight) + module.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.text_proj.weight) + module.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_vision_proj.weight) + module.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_text_proj.weight) + module.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_vision_proj.weight) + module.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_text_proj.weight) + module.out_text_proj.bias.data.fill_(0) + elif isinstance(module, MMGroundingDinoFusionLayer): + module.vision_param.data.fill_(1e-4) + module.text_param.data.fill_(1e-4) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, MMGroundingDinoMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) + + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + if isinstance(module, MMGroundingDinoContrastiveEmbedding): + nn.init.constant_(module.bias, -math.log((1 - 0.01) / 0.01)) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MMGroundingDinoDecoder): + module.gradient_checkpointing = value + + +class MMGroundingDinoFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `MMGroundingDinoFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = MMGroundingDinoFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class MMGroundingDinoConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by MMGroundingDinoFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + **config.backbone_kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class MMGroundingDinoConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the MMGroundingDinoEncoder. This class extends BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + """ +) +class MMGroundingDinoEncoderOutput(ModelOutput): + r""" + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. + text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. + """ + + last_hidden_state_vision: Optional[torch.FloatTensor] = None + last_hidden_state_text: Optional[torch.FloatTensor] = None + vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + + +class MMGroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" + + def __init__(self, config, num_attention_heads=None): + super().__init__() + if config.hidden_size % num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({num_attention_heads})" + ) + + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(config.hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(config.attention_dropout) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = queries.shape + query_layer = ( + self.query(queries) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(keys).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + value_layer = ( + self.value(values).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MMGroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class MMGroundingDinoTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + + def __init__(self, config): + super().__init__() + self.self_attn = MMGroundingDinoMultiheadAttention( + config, num_attention_heads=config.encoder_attention_heads // 2 + ) + + # Implementation of Feedforward model + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) + + self.layer_norm_before = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_after = nn.LayerNorm(config.d_model, config.layer_norm_eps) + + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_masks: Optional[torch.BoolTensor] = None, + position_embeddings: Optional[torch.FloatTensor] = None, + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: + """Text self-attention to enhance projection of text features generated by + the text encoder (AutoModel based on text_config) within MMGroundingDinoEncoderLayer + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + Text features generated by the text encoder. + attention_masks (`torch.BoolTensor`, *optional*): + Attention mask for text self-attention. False for real tokens and True for padding tokens. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings to be added to the hidden states. + + Returns: + `tuple(torch.FloatTensor)` comprising two elements: + - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- + Output of the text self-attention layer. + - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, + sequence_length)`) -- + Attention weights of the text self-attention layer. + """ + + # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: + # batch_size, num_queries, num_keys + attention_masks = attention_masks[:, None, :, :] + attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1) + + dtype = hidden_states.dtype + attention_masks = attention_masks.to(dtype=dtype) # fp16 compatibility + attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min + + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + attention_output, attention_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=attention_masks, + output_attentions=True, + ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual + hidden_states = self.layer_norm_after(hidden_states) + + return hidden_states, attention_weights + + +class MMGroundingDinoDeformableLayer(nn.Module): + def __init__(self, config: MMGroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = MMGroundingDinoMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + spatial_shapes_list (`list[tuple[int, int]]`, *optional*): + Spatial shapes of the backbone feature maps (but as list for export compatibility). + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + return hidden_states, attn_weights + + +# Based on https://github.com/IDEA-Research/MMGroundingDino/blob/2b62f419c292ca9c518daae55512fabc3fead4a4/MMGroundingDino/models/MMGroundingDino/utils.py#L24 +def get_sine_pos_embed( + pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True +) -> Tensor: + """ + Generate sine position embeddings from a position tensor. + + Args: + pos_tensor (torch.Tensor): + Tensor containing positions. Shape: [..., n]. + num_pos_feats (`int`, *optional*, defaults to 128): + Projected shape for each float in the tensor. + temperature (`int`, *optional*, defaults to 10000): + Temperature in the sine/cosine function. + exchange_xy (`bool`, *optional*, defaults to `True`): + Exchange pos x and pos y. For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. + + Returns: + position_embeddings (torch.Tensor): shape: [..., n * hidden_size]. + """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_tensor = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + position_embeddings = [sine_func(x) for x in pos_tensor] + if exchange_xy: + position_embeddings[0], position_embeddings[1] = position_embeddings[1], position_embeddings[0] + position_embeddings = torch.cat(position_embeddings, dim=-1) + return position_embeddings + + +class MMGroundingDinoEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init__() + + self.d_model = config.d_model + + self.text_enhancer_layer = MMGroundingDinoTextEnhancerLayer(config) + self.fusion_layer = MMGroundingDinoFusionLayer(config) + self.deformable_layer = MMGroundingDinoDeformableLayer(config) + + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Optional[torch.Tensor], + text_position_ids: Optional[torch.Tensor], + ) -> Tensor: + batch_size, seq_length, _ = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = torch.arange(seq_length, device=text_features.device) + text_position_embedding = text_position_embedding.float() + text_position_embedding = text_position_embedding.unsqueeze(0).unsqueeze(-1) + text_position_embedding = text_position_embedding.repeat(batch_size, 1, 1) + text_position_embedding = get_sine_pos_embed( + text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False + ) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False + ) + + return text_position_embedding + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + spatial_shapes_list: list[tuple[int, int]], + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + ): + text_position_embedding = self.get_text_position_embeddings( + text_features, text_position_embedding, text_position_ids + ) + + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + (text_features, text_enhanced_attn) = self.text_enhancer_layer( + hidden_states=text_features, + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), + ) + + (vision_features, vision_deformable_attn) = self.deformable_layer( + hidden_states=vision_features, + attention_mask=~key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + ) + + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), + ) + + +class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`MMGroundingDinoEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: MMGroundingDinoConfig + """ + + def __init__(self, config: MMGroundingDinoConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([MMGroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + spatial_shapes_list: list[tuple[int, int]], + level_start_index: Tensor, + valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 0 for pixel features that are real (i.e. **not masked**), + - 1 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + spatial_shapes_list (`list[tuple[int, int]]`): + Spatial shapes of each feature map (but as list for export compatibility). + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) + + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids, + ) + + if output_attentions: + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) + + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + if output_attentions: + all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) + + if not return_dict: + enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] + return tuple(v for v in enc_outputs if v is not None) + return MMGroundingDinoEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + vision_hidden_states=encoder_vision_states, + text_hidden_states=encoder_text_states, + attentions=all_attns, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the MMGroundingDinoDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + """ +) +class MMGroundingDinoDecoderOutput(ModelOutput): + r""" + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + + +class MMGroundingDinoDecoderLayer(nn.Module): + def __init__(self, config: MMGroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = MMGroundingDinoMultiheadAttention(config, num_attention_heads=config.decoder_attention_heads) + + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention text + self.encoder_attn_text = MMGroundingDinoMultiheadAttention( + config, num_attention_heads=config.decoder_attention_heads + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention + self.encoder_attn = MMGroundingDinoMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + residual = hidden_states + + # Self Attention + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, self_attn_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=self_attn_mask, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention Text + queries = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + queries=queries, + keys=text_encoder_hidden_states, + values=text_encoder_hidden_states, + attention_mask=text_encoder_attention_mask, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + third_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = third_residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) + + return outputs + + +class MMGroundingDinoDecoder(MMGroundingDinoPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MMGroundingDinoDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: MMGroundingDinoConfig + """ + + def __init__(self, config: MMGroundingDinoConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layers = nn.ModuleList([MMGroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = MMGroundingDinoMLPPredictionHead( + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 + ) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement as in two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + self.query_scale = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + self_attn_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Last hidden state from encoder related to vision feature map. + vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Last hidden state from encoder related to text features. + text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + spatial_shapes_list (`list[tuple[int, int]]`): + Spatial shapes of the feature maps (but as list for export compatibility). + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`): + Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`: + - 1 for queries that are real (i.e. **not masked**), + - 0 for queries that are padding (i.e. **masked**). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_attns = () if output_attentions else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + if text_encoder_attention_mask is not None: + dtype = text_encoder_hidden_states.dtype + + text_encoder_attention_mask = text_encoder_attention_mask[:, None, None, :] + text_encoder_attention_mask = text_encoder_attention_mask.repeat( + 1, self.config.decoder_attention_heads, self.config.num_queries, 1 + ) + text_encoder_attention_mask = text_encoder_attention_mask.to(dtype=dtype) + text_encoder_attention_mask = text_encoder_attention_mask * torch.finfo(dtype).min + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif num_coordinates == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + query_pos = get_sine_pos_embed(reference_points_input[:, :, 0, :], num_pos_feats=self.config.d_model // 2) + query_pos = self.reference_points_head(query_pos) + + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # without layer norm + if output_hidden_states: + all_hidden_states += (self.layer_norm(hidden_states),) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, + vision_encoder_hidden_states, + vision_encoder_attention_mask, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states=hidden_states, + position_embeddings=query_pos, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + torch.special.logit(reference_points, eps=1e-5) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + torch.special.logit(reference_points, eps=1e-5) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (self.layer_norm(hidden_states),) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if output_attentions: + all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_attns, + ] + if v is not None + ) + return MMGroundingDinoDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_attns, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the Grounding DINO encoder-decoder model. + """ +) +class MMGroundingDinoModelOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. + encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. + encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and + multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the + bi-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Logits of top `config.num_queries` scoring bounding boxes in the first stage. + encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Coordinates of top `config.num_queries` scoring bounding boxes in the first stage. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + init_reference_points: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None + + +class MMGroundingDinoSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, config): + super().__init__() + self.embedding_dim = config.d_model // 2 + self.temperature = config.positional_embedding_temperature + self.scale = 2 * math.pi + + def forward(self, pixel_values, pixel_mask): + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +def build_position_encoding(config): + if config.position_embedding_type == "sine": + position_embedding = MMGroundingDinoSinePositionEmbedding(config) + elif config.position_embedding_type == "learned": + position_embedding = MMGroundingDinoLearnedPositionEmbedding(config) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# these correspond to [CLS], [SEP], . and ? +SPECIAL_TOKENS = [101, 102, 1012, 1029] + + +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Returns: + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) + """ + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask = torch.logical_or(special_tokens_mask, input_ids == special_token) + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + + +@auto_docstring( + custom_intro=""" + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """ +) +class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel): + def __init__(self, config: MMGroundingDinoConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = MMGroundingDinoConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = MMGroundingDinoConvModel(backbone, position_embeddings) + + # Create input projection layers + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for i in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[i] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj_vision = nn.ModuleList(input_proj_list) + + # Create text backbone + self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) + self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = MMGroundingDinoEncoder(config) + self.decoder = MMGroundingDinoDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.encoder_output_bbox_embed = MMGroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + self.encoder_output_class_embed = MMGroundingDinoContrastiveEmbedding(config) + + self.post_init() + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + current_position = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)] + mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) + proposals.append(proposal) + current_position += height * width + + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @auto_docstring + def forward( + self, + pixel_values: Tensor, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + pixel_mask: Optional[Tensor] = None, + encoder_outputs=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." + + >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny") + >>> model = AutoModel.from_pretrained("IDEA-Research/grounding-dino-tiny") + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 900, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + + # Extract text features from text backbone + text_outputs = self.text_backbone( + input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict + ) + text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0] + text_features = self.text_projection(text_features) + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + feature_maps = [] + masks = [] + for level, (source, mask) in enumerate(vision_features): + feature_maps.append(self.input_proj_vision[level](source)) + masks.append(mask) + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(feature_maps): + _len_sources = len(feature_maps) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj_vision[level](vision_features[-1][0]) + else: + source = self.input_proj_vision[level](feature_maps[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + feature_maps.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if self.config.embedding_init_target or self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for level, (source, mask, pos_embed) in enumerate(zip(feature_maps, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes_list.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + vision_features=source_flatten, + vision_attention_mask=~mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=~text_token_mask, + text_position_embedding=None, + text_self_attention_masks=~text_self_attention_masks, + text_position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a MMGroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, MMGroundingDinoEncoderOutput): + encoder_outputs = MMGroundingDinoEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + vision_hidden_states=encoder_outputs[2] if output_hidden_states else None, + text_hidden_states=encoder_outputs[3] if output_hidden_states else None, + attentions=encoder_outputs[-1] if output_attentions else None, + ) + + # Fifth, prepare decoder inputs + topk_proposals = None + enc_outputs_class = None + enc_outputs_coord_logits = None + encoder_logits = None + encoder_pred_boxes = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.generate_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation as in two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.encoder_output_class_embed( + object_query_embedding, encoder_outputs[1], text_token_mask + ) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + if query_embeds is not None: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() + + # Set intermediate topk proposals (coords and class) for loss computation + encoder_pred_boxes = reference_points + encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask) + else: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=~text_token_mask, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + self_attn_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple( + value + for value in [ + enc_outputs_class, + enc_outputs_coord_logits, + encoder_logits, + encoder_pred_boxes, + ] + if value is not None + ) + tuple_outputs = ( + (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs + ) + + return tuple_outputs + + return MMGroundingDinoModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + init_reference_points=init_reference_points, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + encoder_vision_hidden_states=encoder_outputs.vision_hidden_states, + encoder_text_hidden_states=encoder_outputs.text_hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + encoder_logits=encoder_logits, + encoder_pred_boxes=encoder_pred_boxes, + ) + + +class MMGroundingDinoMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`MMGroundingDinoForObjectDetection`]. + """ +) +class MMGroundingDinoObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~MMGroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. + encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Logits of top `config.num_queries` scoring bounding boxes in the first stage. + encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Coordinates of top `config.num_queries` scoring bounding boxes in the first stage. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Encoded candidate labels sequence. Used in processor to post process object detection result. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + init_reference_points: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None + input_ids: Optional[torch.LongTensor] = None + + +def build_label_maps(logits: torch.FloatTensor, input_ids: torch.LongTensor) -> tuple[torch.FloatTensor]: + """ + Computes a mapping between tokens and their corresponding labels, where `num_labels` is determined by the number of classes in the input prompt. + The function identifies segments of tokens between specific delimiter tokens and generates label maps for those segments. + Args: + logits (`torch.Tensor` of shape `(batch_size, seq_length, hidden_size)`): + The output logits from the model, where `hidden_size` corresponds to the dimension of the model's output features. + + input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`): + The input token IDs corresponding to the input prompt. For example, given the prompt "fish. shark.", + `input_ids` might look like `[101, 3869, 1012, 11420, 1012, 102]` where each number corresponds to a token including special tokens. + Returns: + tuple: A tuple containing label maps for each instance in the batch. + - label_maps (tuple of `torch.Tensor`): + A tuple of tensors, where each tensor in the tuple corresponds to an instance in the batch. Each tensor + has shape `(num_labels, hidden_size)` and contains binary values (0 or 1), where `1` indicates the tokens + that are associated with a specific label (class) between delimiter tokens, and `0` elsewhere. + Example: + Given an input prompt "fish. shark." and corresponding `input_ids` as `[101, 3869, 1012, 11420, 1012, 102]`: + - The function identifies the tokens for "fish" (IDs `[3869]`) and "shark" (IDs `[11420]`). + - The function then constructs label maps for these tokens, where each label map indicates which tokens + correspond to which label between the delimiter tokens (e.g., between the period `.`). + - The output is a tuple of label maps, one for each instance in the batch. + Note: + - `SPECIAL_TOKENS` should be a predefined list of tokens that are considered special (e.g., `[CLS]`, `[SEP]`, etc.). + """ + max_seq_len = logits.shape[-1] + # Add [PAD] token to the list of special tokens + delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) + + delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) + label_groups = torch.cumsum(delimiter_token_masks, dim=1) * (~delimiter_token_masks).to(torch.int32) + + label_maps = () + + # Iterate over batch dimension as we can have different number of labels + for label_group in label_groups: + # `label_group` is a tensor of shape `(seq_len,)` with zeros for non-label tokens and integers for label tokens + # label tokens with same integer value are part of the same label group + + # Get unique labels and exclude 0 (i.e. non-label tokens) + unique_labels = torch.unique(label_group)[1:, None] + num_labels = unique_labels.shape[0] + + # Create one-hot encoding for each label group + label_map = label_group.unsqueeze(0).repeat(num_labels, 1) + label_map = torch.where(label_map == unique_labels, 1, 0) + + # Pad label_map to match `max_seq_len` + label_map = F.pad(label_map, (0, max_seq_len - label_map.shape[1]), value=0) + + label_maps += (label_map,) + + return label_maps + + +def build_text_mask(logits, attention_mask): + """ + Create text_mask based on the matching indices + """ + seq_len = attention_mask.shape[1] + text_mask = torch.zeros_like(logits, device=logits.device, dtype=attention_mask.dtype) + text_mask[:, :, :seq_len] = attention_mask[:, None, :] + + return text_mask.bool() + + +@auto_docstring( + custom_intro=""" + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. + """ +) +class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel): + _tied_weights_keys = [ + r"bbox_embed\.[1-9]\d*", + r"model\.decoder\.bbox_embed\.[0-9]\d*", + r"class_embed\.[1-9]\d*", + r"model\.decoder\.class_embed\.[0-9]\d*", + ] + + def __init__(self, config: MMGroundingDinoConfig): + super().__init__(config) + + self.model = MMGroundingDinoModel(config) + + self.class_embed = nn.ModuleList( + [MMGroundingDinoContrastiveEmbedding(config) for _ in range(config.decoder_layers)] + ) + + self.bbox_embed = nn.ModuleList( + [ + MMGroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + for _ in range(config.decoder_layers) + ] + ) + + # hack for box-refinement + self.model.decoder.bbox_embed = self.bbox_embed + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[MMGroundingDinoEncoderOutput, tuple]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[list[dict[str, Union[torch.LongTensor, torch.FloatTensor]]]] = None, + ): + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> import requests + + >>> import torch + >>> from PIL import Image + >>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + + >>> model_id = "IDEA-Research/grounding-dino-tiny" + >>> device = "cuda" + + >>> processor = AutoProcessor.from_pretrained(model_id) + >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) + + >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(image_url, stream=True).raw) + >>> # Check for cats and remote controls + >>> text_labels = [["a cat", "a remote control"]] + + >>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device) + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> results = processor.post_process_grounded_object_detection( + ... outputs, + ... threshold=0.4, + ... text_threshold=0.3, + ... target_sizes=[(image.height, image.width)] + ... ) + >>> # Retrieve the first image result + >>> result = results[0] + >>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]): + ... box = [round(x, 2) for x in box.tolist()] + ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") + Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28] + Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44] + Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values=pixel_values, + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference_points = outputs.init_reference_points if return_dict else outputs[1] + inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + # hidden_states are of shape (batch_size, num_stages, height, width) + # predict class and bounding box deltas for each stage + num_levels = hidden_states.shape[1] + for level in range(num_levels): + if level == 0: + reference = init_reference_points + else: + reference = inter_references_points[:, level - 1] + reference = torch.special.logit(reference, eps=1e-5) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=attention_mask.bool(), + ) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + + reference_coordinates = reference.shape[-1] + if reference_coordinates == 4: + outputs_coord_logits = delta_bbox + reference + elif reference_coordinates == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + label_maps = build_label_maps(logits, input_ids) + text_mask = build_text_mask(logits, attention_mask) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + label_maps, + text_mask, + outputs_class=outputs_class, + outputs_coord=outputs_coord, + encoder_logits=outputs[-2], + encoder_pred_boxes=outputs[-1], + ) + + if not return_dict: + auxiliary_outputs = auxiliary_outputs if auxiliary_outputs is not None else [] + output = [loss, loss_dict, logits, pred_boxes, *auxiliary_outputs, *outputs, input_ids] + output = tuple(out for out in output if out is not None) + return output + + dict_outputs = MMGroundingDinoObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + last_hidden_state=outputs.last_hidden_state, + auxiliary_outputs=auxiliary_outputs, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_vision_hidden_states=outputs.encoder_vision_hidden_states, + encoder_text_hidden_states=outputs.encoder_text_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + encoder_logits=outputs.encoder_logits, + encoder_pred_boxes=outputs.encoder_pred_boxes, + input_ids=input_ids, + ) + + return dict_outputs + + +__all__ = ["MMGroundingDinoForObjectDetection", "MMGroundingDinoModel", "MMGroundingDinoPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..02a5401bc145996d1126641ee656180a48c92e20 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for MobileNetV1.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class MobileNetV1FeatureExtractor(MobileNetV1ImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class MobileNetV1FeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use MobileNetV1ImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["MobileNetV1FeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3e875620e62736ec5de99e5a8e4a434301e0a5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37e8f2a42e1a566560ae4dcd5a450ef2fee546e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee064d98823e04301c2bbcbdd77403ff5c28c7fd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87da02173483c8ee532b4bbea046496e8358b273 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4610b4f7b15c3cda54da252455dedcadd78f4a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ff1852b2826e8063f8621b806a122486585646a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a686750bba00a176f27233d60003bb45ad7a7e6d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dad8a22d0f5130a56ec3f159c878dfea1b18958b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c04edefd66be0b4a7133422250b6c94c6f022c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc491dc6be535c47a9ad997fe80f05fd7afc94b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eda93241e1a909f870360ceae157257a269eb82f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d92e97405459b602b2520ce4a88624fb98a770c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37b241fd135664f1f5d6f8db6e99bebddc5de0ec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76449ce4fad7da5e271dc6d25467c395b0a23557 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e472139bdf864cf015cd7b57599f099583a3e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33e7be0d01be71c4a4bdd4b4ddd4340f77e98726 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bbf46adf59857321334a2d1a3685d46278ea848 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9319d516d82c67f1f002608532cdc7ef55e47a64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06f8b02301a5308eaf82cd6e3cf015f0c19407e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0326dae38a04abba5d8c7b721f4d862aeb302282 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e64302a535676ef5e11d047e7ef7d95d23e4d8bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85213459dacef1444c1efb3bacbcafe788bab91a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4334a8cefc8e5a8f8218d776b809d9c93911b197 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b673d0550e4ac3555b3b8d1d7f7236c2a7ad278b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94c86a256daae3fbf273af5f29bc314b1fc682af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f3b9f599acb760dd4784b02af6c485390f4f88c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bc41f25bd73128bcfc4079c03e3fdd95cd62fcd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a56efc31124f42723bbdedd6117039ee08f5746a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9dd4c1e38c53cf8ef2c47a31449975d41133d48 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0952cbe50dac0ef9f9cf890c44fffb094af3b518 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d98c0304a17be67d55d90e5073efbe67a068f967 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b42ee0744425d09ab189eeb0f3998b129f33bddb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78b4da8000d3315fd55b3ed1842ab511c46e7356 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ceeb9895278cab27cbc9ea93428dc5cda305a362 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9e9bbfd57bf8df1f26d43d030572696736dfe67 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6788c0fbde0d0dd4654df71bbedc13ec28ebf838 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3fd80c2bcc6b268cd47a7afe321364ce3ed1aa5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_owlvit import * + from .feature_extraction_owlvit import * + from .image_processing_owlvit import * + from .image_processing_owlvit_fast import * + from .modeling_owlvit import * + from .processing_owlvit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py new file mode 100644 index 0000000000000000000000000000000000000000..d4873ff4a08b3f53cfb4dc8cd63ba01e7a2e96c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py @@ -0,0 +1,336 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""OWL-ViT model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class OwlViTTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`OwlViTTextModel`]. It is used to instantiate an + OwlViT text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the OwlViT + [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the OWL-ViT text model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`OwlViTTextModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 16): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token in the input sequences. + bos_token_id (`int`, *optional*, defaults to 49406): + The id of the beginning-of-sequence token in the input sequences. + eos_token_id (`int`, *optional*, defaults to 49407): + The id of the end-of-sequence token in the input sequences. + + Example: + + ```python + >>> from transformers import OwlViTTextConfig, OwlViTTextModel + + >>> # Initializing a OwlViTTextModel with google/owlvit-base-patch32 style configuration + >>> configuration = OwlViTTextConfig() + + >>> # Initializing a OwlViTTextConfig from the google/owlvit-base-patch32 style configuration + >>> model = OwlViTTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "owlvit_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=16, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=0, + bos_token_id=49406, + eos_token_id=49407, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + + +class OwlViTVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`OwlViTVisionModel`]. It is used to instantiate + an OWL-ViT image encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the OWL-ViT + [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 768): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import OwlViTVisionConfig, OwlViTVisionModel + + >>> # Initializing a OwlViTVisionModel with google/owlvit-base-patch32 style configuration + >>> configuration = OwlViTVisionConfig() + + >>> # Initializing a OwlViTVisionModel model from the google/owlvit-base-patch32 style configuration + >>> model = OwlViTVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "owlvit_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=768, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + + +class OwlViTConfig(PretrainedConfig): + r""" + [`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to + instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model + configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWL-ViT + [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`OwlViTTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`OwlViTVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT + implementation. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not the model should return a dictionary. If `False`, returns a tuple. + kwargs (*optional*): + Dictionary of keyword arguments. + """ + + model_type = "owlvit" + sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + return_dict=True, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the OwlViTTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the OwlViTVisionConfig with default values.") + + self.text_config = OwlViTTextConfig(**text_config) + self.vision_config = OwlViTVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.return_dict = return_dict + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: dict, vision_config: dict, **kwargs): + r""" + Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision + model configuration. + + Returns: + [`OwlViTConfig`]: An instance of a configuration object + """ + config_dict = {} + config_dict["text_config"] = text_config + config_dict["vision_config"] = vision_config + + return cls.from_dict(config_dict, **kwargs) + + +class OwlViTOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.image_processor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 + + +__all__ = ["OwlViTConfig", "OwlViTOnnxConfig", "OwlViTTextConfig", "OwlViTVisionConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3a8d0b145cc54e8ccebf7acc84538236fac644 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for OwlViT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_owlvit import OwlViTImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class OwlViTFeatureExtractor(OwlViTImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class OwlViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use OwlViTImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["OwlViTFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py new file mode 100644 index 0000000000000000000000000000000000000000..cc9c6cfdeaa8aa0532539f03e3e8e1e1aa3d9619 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py @@ -0,0 +1,625 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for OwlViT""" + +import warnings +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + center_crop, + center_to_corners_format, + rescale, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_owlvit import OwlViTObjectDetectionOutput + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +def _upcast(t): + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. + """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + +def box_area(boxes): + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +@requires(backends=("vision",)) +class OwlViTImageProcessor(BaseImageProcessor): + r""" + Constructs an OWL-ViT image processor. + + This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the shorter edge of the input to a certain `size`. + size (`dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}): + The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a + sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized + to (size, size). + resample (`int`, *optional*, defaults to `Resampling.BICUBIC`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set + to `True`. + do_center_crop (`bool`, *optional*, defaults to `False`): + Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the + image is padded with 0's and then center cropped. + crop_size (`int`, *optional*, defaults to {"height": 768, "width": 768}): + The size to use for center cropping the image. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the input by a certain factor. + rescale_factor (`float`, *optional*, defaults to `1/255`): + The factor to use for rescaling the image. Only has an effect if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with `image_mean` and `image_std`. Desired output size when applying + center-cropping. Only has an effect if `do_center_crop` is set to `True`. + image_mean (`list[int]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`list[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize=True, + size=None, + resample=PILImageResampling.BICUBIC, + do_center_crop=False, + crop_size=None, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs, + ): + size = size if size is not None else {"height": 768, "width": 768} + size = get_size_dict(size, default_to_square=True) + + crop_size = crop_size if crop_size is not None else {"height": 768, "width": 768} + crop_size = get_size_dict(crop_size, default_to_square=True) + + # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the + # vision image processor method `rescale` as it would be set as an attribute during the super().__init__ + # call. This is for backwards compatibility. + if "rescale" in kwargs: + rescale_val = kwargs.pop("rescale") + kwargs["do_rescale"] = rescale_val + + super().__init__(**kwargs) + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to a certain size. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + The size to resize the image to. Must contain height and width keys. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + The resampling filter to use when resizing the input. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=True) + if "height" not in size or "width" not in size: + raise ValueError("size dictionary must contain height and width keys") + + return resize( + image, + (size["height"], size["width"]), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def center_crop( + self, + image: np.ndarray, + crop_size: dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Center crop an image to a certain size. + + Args: + image (`np.ndarray`): + Image to center crop. + crop_size (`dict[str, int]`): + The size to center crop the image to. Must contain height and width keys. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + crop_size = get_size_dict(crop_size, default_to_square=True) + if "height" not in crop_size or "width" not in crop_size: + raise ValueError("crop_size dictionary must contain height and width keys") + + return center_crop( + image, + (crop_size["height"], crop_size["width"]), + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Prepares an image or batch of images for the model. + + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Expects a single or batch of images with pixel values + ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether or not to resize the input. If `True`, will resize the input to the size specified by `size`. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + The size to resize the input to. Only has an effect if `do_resize` is set to `True`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + The resampling filter to use when resizing the input. Only has an effect if `do_resize` is set to + `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether or not to center crop the input. If `True`, will center crop the input to the size specified by + `crop_size`. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + The size to center crop the input to. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to rescale the input. If `True`, will rescale the input by dividing it by + `rescale_factor`. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + The factor to rescale the input by. Only has an effect if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether or not to normalize the input. If `True`, will normalize the input by subtracting `image_mean` + and dividing by `image_std`. + image_mean (`Union[float, list[float]]`, *optional*, defaults to `self.image_mean`): + The mean to subtract from the input when normalizing. Only has an effect if `do_normalize` is set to + `True`. + image_std (`Union[float, list[float]]`, *optional*, defaults to `self.image_std`): + The standard deviation to divide the input by when normalizing. Only has an effect if `do_normalize` is + set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image, crop_size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image, rescale_factor=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + return encoded_inputs + + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + # TODO: (amy) add support for other frameworks + warnings.warn( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + FutureWarning, + ) + + logits, boxes = outputs.logits, outputs.pred_boxes + + if len(logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + labels = probs.indices + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, + outputs: "OwlViTObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None): + """ + Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO + api. + + Args: + outputs ([`OwlViTImageGuidedObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.0): + Minimum confidence threshold to use to filter out predicted boxes. + nms_threshold (`float`, *optional*, defaults to 0.3): + IoU threshold for non-maximum suppression of overlapping boxes. + target_sizes (`torch.Tensor`, *optional*): + Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in + the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to + None, predictions will not be unnormalized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. All labels are set to None as + `OwlViTForObjectDetection.image_guided_detection` perform one-shot object detection. + """ + logits, target_boxes = outputs.logits, outputs.target_pred_boxes + + if target_sizes is not None and len(logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes is not None and target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + + # Convert to [x0, y0, x1, y1] format + target_boxes = center_to_corners_format(target_boxes) + + # Apply non-maximum suppression (NMS) + if nms_threshold < 1.0: + for idx in range(target_boxes.shape[0]): + for i in torch.argsort(-scores[idx]): + if not scores[idx][i]: + continue + + ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0] + ious[i] = -1.0 # Mask self-IoU. + scores[idx][ious > nms_threshold] = 0.0 + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + target_boxes = _scale_boxes(target_boxes, target_sizes) + + # Compute box display alphas based on prediction scores + results = [] + alphas = torch.zeros_like(scores) + + for idx in range(target_boxes.shape[0]): + # Select scores for boxes matching the current query: + query_scores = scores[idx] + if not query_scores.nonzero().numel(): + continue + + # Apply threshold on scores before scaling + query_scores[query_scores < threshold] = 0.0 + + # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1. + # All other boxes will either belong to a different query, or will not be shown. + max_score = torch.max(query_scores) + 1e-6 + query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9) + query_alphas = torch.clip(query_alphas, 0.0, 1.0) + alphas[idx] = query_alphas + + mask = alphas[idx] > 0 + box_scores = alphas[idx][mask] + boxes = target_boxes[idx][mask] + results.append({"scores": box_scores, "labels": None, "boxes": boxes}) + + return results + + +__all__ = ["OwlViTImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..1e458f964a04cd9bf12712f82f4691be374d497d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for OwlViT""" + +import warnings +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_transforms import center_to_corners_format +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import TensorType, auto_docstring, logging +from .image_processing_owlvit import _scale_boxes, box_iou + + +if TYPE_CHECKING: + from .modeling_owlvit import OwlViTObjectDetectionOutput + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class OwlViTImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 768, "width": 768} + default_to_square = True + crop_size = {"height": 768, "width": 768} + do_resize = True + do_center_crop = False + do_rescale = True + do_normalize = None + do_convert_rgb = None + model_input_names = ["pixel_values"] + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). For visualization, this should be the image size after data + augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + # TODO: (amy) add support for other frameworks + warnings.warn( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + FutureWarning, + ) + + logits, boxes = outputs.logits, outputs.pred_boxes + + if len(logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + labels = probs.indices + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection + def post_process_object_detection( + self, + outputs: "OwlViTObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_image_guided_detection + def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None): + """ + Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO + api. + + Args: + outputs ([`OwlViTImageGuidedObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.0): + Minimum confidence threshold to use to filter out predicted boxes. + nms_threshold (`float`, *optional*, defaults to 0.3): + IoU threshold for non-maximum suppression of overlapping boxes. + target_sizes (`torch.Tensor`, *optional*): + Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in + the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to + None, predictions will not be unnormalized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. All labels are set to None as + `OwlViTForObjectDetection.image_guided_detection` perform one-shot object detection. + """ + logits, target_boxes = outputs.logits, outputs.target_pred_boxes + + if target_sizes is not None and len(logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes is not None and target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + + # Convert to [x0, y0, x1, y1] format + target_boxes = center_to_corners_format(target_boxes) + + # Apply non-maximum suppression (NMS) + if nms_threshold < 1.0: + for idx in range(target_boxes.shape[0]): + for i in torch.argsort(-scores[idx]): + if not scores[idx][i]: + continue + + ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0] + ious[i] = -1.0 # Mask self-IoU. + scores[idx][ious > nms_threshold] = 0.0 + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + target_boxes = _scale_boxes(target_boxes, target_sizes) + + # Compute box display alphas based on prediction scores + results = [] + alphas = torch.zeros_like(scores) + + for idx in range(target_boxes.shape[0]): + # Select scores for boxes matching the current query: + query_scores = scores[idx] + if not query_scores.nonzero().numel(): + continue + + # Apply threshold on scores before scaling + query_scores[query_scores < threshold] = 0.0 + + # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1. + # All other boxes will either belong to a different query, or will not be shown. + max_score = torch.max(query_scores) + 1e-6 + query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9) + query_alphas = torch.clip(query_alphas, 0.0, 1.0) + alphas[idx] = query_alphas + + mask = alphas[idx] > 0 + box_scores = alphas[idx][mask] + boxes = target_boxes[idx][mask] + results.append({"scores": box_scores, "labels": None, "boxes": boxes}) + + return results + + +__all__ = ["OwlViTImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py new file mode 100644 index 0000000000000000000000000000000000000000..3971b1376d9c10b945fa17fd9f23fcba5ff65e38 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py @@ -0,0 +1,1642 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OWL-ViT model.""" + +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Optional, Union + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + auto_docstring, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + torch_int, +) +from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig + + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +logger = logging.get_logger(__name__) + + +# See all OwlViT models at https://huggingface.co/models?filter=owlvit + + +# Copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlvit +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit +def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +@auto_docstring +class OwlViTOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`OwlViTVisionModel`]. + text_model_output (tuple[`BaseModelOutputWithPooling`]): + The output of the [`OwlViTTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`OwlViTVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.loss.loss_for_object_detection._upcast +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +# Copied from transformers.loss.loss_for_object_detection.box_area +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# Copied from transformers.loss.loss_for_object_detection.box_iou +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +# Copied from transformers.loss.loss_for_object_detection.generalized_box_iou +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`OwlViTForObjectDetection`]. + """ +) +class OwlViTObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): + Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes + image embeddings for each patch. + class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`): + Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total + number of patches is (image_size / patch_size)**2. + text_model_output (tuple[`BaseModelOutputWithPooling`]): + The output of the [`OwlViTTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`OwlViTVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + class_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`OwlViTForObjectDetection.image_guided_detection`]. + """ +) +class OwlViTImageGuidedObjectDetectionOutput(ModelOutput): + r""" + logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`): + Classification logits (including no-object) for all queries. + image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): + Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes + image embeddings for each patch. + query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): + Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes + image embeddings for each patch. + target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual target image in the batch + (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to + retrieve the unnormalized bounding boxes. + query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual query image in the batch + (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to + retrieve the unnormalized bounding boxes. + class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`): + Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total + number of patches is (image_size / patch_size)**2. + text_model_output (tuple[`BaseModelOutputWithPooling`]): + The output of the [`OwlViTTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`OwlViTVisionModel`]. + """ + + logits: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + query_image_embeds: Optional[torch.FloatTensor] = None + target_pred_boxes: Optional[torch.FloatTensor] = None + query_pred_boxes: Optional[torch.FloatTensor] = None + class_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class OwlViTVisionEmbeddings(nn.Module): + def __init__(self, config: OwlViTVisionConfig): + super().__init__() + self.patch_size = config.patch_size + self.config = config + self.embed_dim = config.hidden_size + self.class_embedding = nn.Parameter(torch.randn(config.hidden_size)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + + self.num_patches = (config.image_size // config.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + patch_embeds = self.patch_embedding(pixel_values) # shape = [batch_size, num_channels, height, width] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class OwlViTTextEmbeddings(nn.Module): + def __init__(self, config: OwlViTTextConfig): + super().__init__() + self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class OwlViTAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # For int8 compatibility, sometimes the `attn_probs` are in `fp32` + attn_probs = attn_probs.to(value_states.dtype) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->OwlViT +class OwlViTMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->OwlViT +class OwlViTEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: OwlViTConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = OwlViTAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = OwlViTMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +@auto_docstring +class OwlViTPreTrainedModel(PreTrainedModel): + config: OwlViTConfig + base_model_prefix = "owlvit" + supports_gradient_checkpointing = True + _no_split_modules = ["OwlViTEncoderLayer"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, OwlViTTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, OwlViTVisionEmbeddings): + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, OwlViTAttention): + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, OwlViTMLP): + in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, OwlViTModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * factor, + ) + module.logit_scale.data.fill_(self.config.logit_scale_init_value) + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if module.bias is not None: + module.bias.data.zero_() + + +class OwlViTEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`OwlViTEncoderLayer`]. + + Args: + config: OwlViTConfig + """ + + def __init__(self, config: OwlViTConfig): + super().__init__() + self.layers = nn.ModuleList([OwlViTEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`). + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class OwlViTTextTransformer(nn.Module): + def __init__(self, config: OwlViTTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = OwlViTTextEmbeddings(config) + self.encoder = OwlViTEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + # num_samples, seq_len = input_shape where num_samples = batch_size * num_max_text_queries + # OWLVIT's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = _create_4d_causal_attention_mask( + input_shape, hidden_states.dtype, device=hidden_states.device + ) + # expand attention_mask + if attention_mask is not None: + # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # take features from the end of tokens embedding (end of token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device), + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class OwlViTTextModel(OwlViTPreTrainedModel): + config: OwlViTTextConfig + + def __init__(self, config: OwlViTTextConfig): + super().__init__(config) + self.text_model = OwlViTTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @auto_docstring + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + + Examples: + ```python + >>> from transformers import AutoProcessor, OwlViTTextModel + + >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + >>> inputs = processor( + ... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt" + ... ) + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + + # Get embeddings for all text queries in all batch samples + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class OwlViTVisionTransformer(nn.Module): + def __init__(self, config: OwlViTVisionConfig): + super().__init__() + self.config = config + + self.embeddings = OwlViTVisionEmbeddings(config) + self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.encoder = OwlViTEncoder(config) + self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Cast the input to the expected `dtype` + expected_input_dtype = self.embeddings.patch_embedding.weight.dtype + pixel_values = pixel_values.to(expected_input_dtype) + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layernorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class OwlViTVisionModel(OwlViTPreTrainedModel): + config: OwlViTVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: OwlViTVisionConfig): + super().__init__(config) + self.vision_model = OwlViTVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, OwlViTVisionModel + + >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + +@auto_docstring +class OwlViTModel(OwlViTPreTrainedModel): + config: OwlViTConfig + + def __init__(self, config: OwlViTConfig): + super().__init__(config) + + if not isinstance(config.text_config, OwlViTTextConfig): + raise TypeError( + "config.text_config is expected to be of type OwlViTTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, OwlViTVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type OwlViTVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = OwlViTTextTransformer(text_config) + self.vision_model = OwlViTVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value)) + + # Initialize weights and apply final processing + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`OwlViTTextModel`]. + + Examples: + ```python + >>> import torch + >>> from transformers import AutoProcessor, OwlViTModel + + >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + >>> inputs = processor( + ... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt" + ... ) + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + ```""" + # Get embeddings for all text queries in all batch samples + text_outputs: BaseModelOutputWithPooling = self.text_model(input_ids=input_ids, attention_mask=attention_mask) + text_features = self.text_projection(text_outputs.pooler_output) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_image_features( + self, + pixel_values: torch.Tensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`OwlViTVisionModel`]. + + Examples: + ```python + >>> import torch + >>> from transformers.image_utils import load_image + >>> from transformers import AutoProcessor, OwlViTModel + + >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + >>> with torch.inference_mode(): + ... image_features = model.get_image_features(**inputs) + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_features = self.visual_projection(vision_outputs.pooler_output) + + return image_features + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_base_image_embeds: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, OwlViTOutput]: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + return_base_image_embeds (`bool`, *optional*): + Whether or not to return the base image embeddings. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, OwlViTModel + + >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + # Get embeddings for all text queries in all batch samples + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + # normalized features + image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True) + text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True) + + # cosine similarity as logits and set it on the correct device + logit_scale = self.logit_scale.exp().to(image_embeds.device) + + logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = owlvit_loss(logits_per_text) + + text_embeds = text_embeds_norm + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return OwlViTOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class OwlViTBoxPredictionHead(nn.Module): + def __init__(self, config: OwlViTConfig, out_dim: int = 4): + super().__init__() + + width = config.vision_config.hidden_size + self.dense0 = nn.Linear(width, width) + self.dense1 = nn.Linear(width, width) + self.gelu = nn.GELU() + self.dense2 = nn.Linear(width, out_dim) + + def forward(self, image_features: torch.Tensor) -> torch.FloatTensor: + output = self.dense0(image_features) + output = self.gelu(output) + output = self.dense1(output) + output = self.gelu(output) + output = self.dense2(output) + return output + + +class OwlViTClassPredictionHead(nn.Module): + def __init__(self, config: OwlViTConfig): + super().__init__() + + out_dim = config.text_config.hidden_size + self.query_dim = config.vision_config.hidden_size + + self.dense0 = nn.Linear(self.query_dim, out_dim) + self.logit_shift = nn.Linear(self.query_dim, 1) + self.logit_scale = nn.Linear(self.query_dim, 1) + self.elu = nn.ELU() + + def forward( + self, + image_embeds: torch.FloatTensor, + query_embeds: Optional[torch.FloatTensor], + query_mask: Optional[torch.Tensor], + ) -> tuple[torch.FloatTensor]: + image_class_embeds = self.dense0(image_embeds) + if query_embeds is None: + device = image_class_embeds.device + batch_size, num_patches = image_class_embeds.shape[:2] + pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device) + return (pred_logits, image_class_embeds) + + # Normalize image and text features + image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6) + query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6) + + # Get class predictions + pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds) + + # Apply a learnable shift and scale to logits + logit_shift = self.logit_shift(image_embeds) + logit_scale = self.logit_scale(image_embeds) + logit_scale = self.elu(logit_scale) + 1 + pred_logits = (pred_logits + logit_shift) * logit_scale + + if query_mask is not None: + if query_mask.ndim > 1: + query_mask = torch.unsqueeze(query_mask, dim=-2) + + pred_logits = torch.where(query_mask == 0, torch.finfo(pred_logits.dtype).min, pred_logits) + pred_logits = pred_logits.to(torch.float32) + + return (pred_logits, image_class_embeds) + + +class OwlViTForObjectDetection(OwlViTPreTrainedModel): + config: OwlViTConfig + + def __init__(self, config: OwlViTConfig): + super().__init__(config) + + self.owlvit = OwlViTModel(config) + self.class_head = OwlViTClassPredictionHead(config) + self.box_head = OwlViTBoxPredictionHead(config) + + self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps) + self.sigmoid = nn.Sigmoid() + self.config = config + self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size + self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size + self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width) + + @staticmethod + def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor: + # Create grid coordinates using torch + x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32) + y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32) + xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy") + + # Stack the coordinates and divide by their respective patch counts + box_coordinates = torch.stack((xx, yy), dim=-1) + box_coordinates[..., 0] /= num_patches_width + box_coordinates[..., 1] /= num_patches_height + + # Flatten (h, w, 2) -> (h*w, 2) + box_coordinates = box_coordinates.view(-1, 2) + + return box_coordinates + + @lru_cache(maxsize=2) + def compute_box_bias( + self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None + ) -> torch.Tensor: + if feature_map is not None: + raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead") + # The box center is biased to its position on the feature grid + box_coordinates = self.normalize_grid_corner_coordinates(num_patches_height, num_patches_width) + box_coordinates = torch.clip(box_coordinates, 0.0, 1.0) + + # Unnormalize xy + box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4) + + # The box size is biased to the patch size + box_size = torch.full_like(box_coord_bias, 1.0) + box_size[..., 0] /= num_patches_width + box_size[..., 1] /= num_patches_height + box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4) + + # Compute box bias + box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1) + return box_bias + + def box_predictor( + self, + image_feats: torch.FloatTensor, + feature_map: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + """ + Args: + image_feats: + Features extracted from the image, returned by the `image_text_embedder` method. + feature_map: + A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method. + interpolate_pos_encoding: + Whether to interpolate the pre-trained position encodings. + Returns: + pred_boxes: + List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary. + """ + # Bounding box detection head [batch_size, num_boxes, 4]. + pred_boxes = self.box_head(image_feats) + + # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction + if interpolate_pos_encoding: + _, num_patches_height, num_patches_width, _ = feature_map.shape + box_bias = self.compute_box_bias(num_patches_height, num_patches_width) + else: + box_bias = self.box_bias + + box_bias = box_bias.to(feature_map.device) + pred_boxes += box_bias + pred_boxes = self.sigmoid(pred_boxes) + return pred_boxes + + def class_predictor( + self, + image_feats: torch.FloatTensor, + query_embeds: Optional[torch.FloatTensor] = None, + query_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.FloatTensor]: + """ + Args: + image_feats: + Features extracted from the `image_text_embedder`. + query_embeds: + Text query embeddings. + query_mask: + Must be provided with query_embeddings. A mask indicating which query embeddings are valid. + """ + (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask) + + return (pred_logits, image_class_embeds) + + def image_text_embedder( + self, + input_ids: torch.Tensor, + pixel_values: torch.FloatTensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> tuple[torch.FloatTensor]: + # Encode text and image + outputs = self.owlvit( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=True, + ) + + if interpolate_pos_encoding: + _, _, height, width = pixel_values.shape + num_patches_height = height // self.config.vision_config.patch_size + num_patches_width = width // self.config.vision_config.patch_size + else: + num_patches_height = self.num_patches_height + num_patches_width = self.num_patches_width + + # Get image embeddings + last_hidden_state = outputs.vision_model_output[0] + image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state) + + # Resize class token + class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape) + + # Merge image embedding with class tokens + image_embeds = image_embeds[:, 1:, :] * class_token_out + image_embeds = self.layer_norm(image_embeds) + + # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size] + new_size = ( + image_embeds.shape[0], + num_patches_height, + num_patches_width, + image_embeds.shape[-1], + ) + image_embeds = image_embeds.reshape(new_size) + text_embeds = outputs[-4] + + return (text_embeds, image_embeds, outputs) + + def image_embedder( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> tuple[torch.FloatTensor]: + # Get OwlViTModel vision embeddings (same as CLIP) + vision_outputs = self.owlvit.vision_model( + pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True + ) + + if interpolate_pos_encoding: + _, _, height, width = pixel_values.shape + num_patches_height = height // self.config.vision_config.patch_size + num_patches_width = width // self.config.vision_config.patch_size + else: + num_patches_height = self.num_patches_height + num_patches_width = self.num_patches_width + + # Apply post_layernorm to last_hidden_state, return non-projected output + last_hidden_state = vision_outputs[0] + image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state) + + # Resize class token + class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape) + + # Merge image embedding with class tokens + image_embeds = image_embeds[:, 1:, :] * class_token_out + image_embeds = self.layer_norm(image_embeds) + + # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size] + new_size = ( + image_embeds.shape[0], + num_patches_height, + num_patches_width, + image_embeds.shape[-1], + ) + image_embeds = image_embeds.reshape(new_size) + + return (image_embeds, vision_outputs) + + def embed_image_query( + self, + query_image_features: torch.FloatTensor, + query_feature_map: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + _, class_embeds = self.class_predictor(query_image_features) + pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding) + pred_boxes_as_corners = center_to_corners_format(pred_boxes) + + # Loop over query images + best_class_embeds = [] + best_box_indices = [] + pred_boxes_device = pred_boxes_as_corners.device + + for i in range(query_image_features.shape[0]): + each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device) + each_query_pred_boxes = pred_boxes_as_corners[i] + ious, _ = box_iou(each_query_box, each_query_pred_boxes) + + # If there are no overlapping boxes, fall back to generalized IoU + if torch.all(ious[0] == 0.0): + ious = generalized_box_iou(each_query_box, each_query_pred_boxes) + + # Use an adaptive threshold to include all boxes within 80% of the best IoU + iou_threshold = torch.max(ious) * 0.8 + + selected_inds = (ious[0] >= iou_threshold).nonzero() + if selected_inds.numel(): + selected_embeddings = class_embeds[i][selected_inds.squeeze(1)] + mean_embeds = torch.mean(class_embeds[i], axis=0) + mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings) + best_box_ind = selected_inds[torch.argmin(mean_sim)] + best_class_embeds.append(class_embeds[i][best_box_ind]) + best_box_indices.append(best_box_ind) + + if best_class_embeds: + query_embeds = torch.stack(best_class_embeds) + box_indices = torch.stack(best_box_indices) + else: + query_embeds, box_indices = None, None + + return query_embeds, box_indices, pred_boxes + + @auto_docstring + def image_guided_detection( + self, + pixel_values: torch.FloatTensor, + query_pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> OwlViTImageGuidedObjectDetectionOutput: + r""" + query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values of query image(s) to be detected. Pass in one query image per target image. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + >>> import torch + >>> from transformers import AutoProcessor, OwlViTForObjectDetection + + >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16") + >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg" + >>> query_image = Image.open(requests.get(query_url, stream=True).raw) + >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt") + >>> with torch.no_grad(): + ... outputs = model.image_guided_detection(**inputs) + >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] + >>> target_sizes = torch.Tensor([image.size[::-1]]) + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> results = processor.post_process_image_guided_detection( + ... outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes + ... ) + >>> i = 0 # Retrieve predictions for the first image + >>> boxes, scores = results[i]["boxes"], results[i]["scores"] + >>> for box, score in zip(boxes, scores): + ... box = [round(i, 2) for i in box.tolist()] + ... print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}") + Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39] + Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Compute feature maps for the input and query images + query_feature_map = self.image_embedder( + pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding + )[0] + feature_map, vision_outputs = self.image_embedder( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape + image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)) + + batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape + query_image_feats = torch.reshape( + query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim) + ) + # Get top class embedding and best box index for each query image in batch + query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query( + query_image_feats, query_feature_map, interpolate_pos_encoding + ) + + # Predict object classes [batch_size, num_patches, num_queries+1] + (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds) + + # Predict object boxes + target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding) + + if not return_dict: + output = ( + feature_map, + query_feature_map, + target_pred_boxes, + query_pred_boxes, + pred_logits, + class_embeds, + vision_outputs.to_tuple(), + ) + output = tuple(x for x in output if x is not None) + return output + + return OwlViTImageGuidedObjectDetectionOutput( + image_embeds=feature_map, + query_image_embeds=query_feature_map, + target_pred_boxes=target_pred_boxes, + query_pred_boxes=query_pred_boxes, + logits=pred_logits, + class_embeds=class_embeds, + text_model_output=None, + vision_model_output=vision_outputs, + ) + + @auto_docstring + def forward( + self, + input_ids: torch.Tensor, + pixel_values: torch.FloatTensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> OwlViTObjectDetectionOutput: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids). + output_hidden_states (`bool`, *optional*): + Whether or not to return the last hidden state. See `text_model_last_hidden_state` and + `vision_model_last_hidden_state` under returned tensors for more detail. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + >>> import torch + + >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection + + >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") + >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text_labels = [["a photo of a cat", "a photo of a dog"]] + >>> inputs = processor(text=text_labels, images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] + >>> target_sizes = torch.tensor([(image.height, image.width)]) + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> results = processor.post_process_grounded_object_detection( + ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels + ... ) + >>> # Retrieve predictions for the first image for the corresponding text queries + >>> result = results[0] + >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] + >>> for box, score, text_label in zip(boxes, scores, text_labels): + ... box = [round(i, 2) for i in box.tolist()] + ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") + Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] + Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Embed images and text queries + query_embeds, feature_map, outputs = self.image_text_embedder( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + # Text and vision model outputs + text_outputs = outputs.text_model_output + vision_outputs = outputs.vision_model_output + + batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape + image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)) + + # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim] + max_text_queries = input_ids.shape[0] // batch_size + query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1]) + + # If first token is 0, then this is a padded query [batch_size, num_queries]. + input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1]) + query_mask = input_ids[..., 0] > 0 + + # Predict object classes [batch_size, num_patches, num_queries+1] + (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask) + + # Predict object boxes + pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding) + + if not return_dict: + output = ( + pred_logits, + pred_boxes, + query_embeds, + feature_map, + class_embeds, + text_outputs.to_tuple(), + vision_outputs.to_tuple(), + ) + output = tuple(x for x in output if x is not None) + return output + + return OwlViTObjectDetectionOutput( + image_embeds=feature_map, + text_embeds=query_embeds, + pred_boxes=pred_boxes, + logits=pred_logits, + class_embeds=class_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = ["OwlViTModel", "OwlViTPreTrainedModel", "OwlViTTextModel", "OwlViTVisionModel", "OwlViTForObjectDetection"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py new file mode 100644 index 0000000000000000000000000000000000000000..ae39d2b6b30783fb7281b56f874f8a840b129732 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py @@ -0,0 +1,322 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for OWL-ViT +""" + +import warnings +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available + + +if TYPE_CHECKING: + from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput + + +class OwlViTImagesKwargs(ImagesKwargs, total=False): + query_images: Optional[ImageInput] + + +class OwlViTProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: OwlViTImagesKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + }, + "images_kwargs": {}, + "common_kwargs": { + "return_tensors": "np", + }, + } + + +class OwlViTProcessor(ProcessorMixin): + r""" + Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] + into a single processor that inherits both the image processor and tokenizer functionalities. See the + [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. + + Args: + image_processor ([`OwlViTImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`], *optional*): + The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "OwlViTImageProcessor" + tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[OwlViTProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and + `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, + `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The query image to be prepared, one query image is expected per target image to be queried. Each image + can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image + should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + OwlViTProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + query_images = output_kwargs["images_kwargs"].pop("query_images", None) + return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + + if text is None and query_images is None and images is None: + raise ValueError( + "You have to specify at least one text or query image or image. All three cannot be none." + ) + + data = {} + if text is not None: + if isinstance(text, str) or (isinstance(text, list) and not isinstance(text[0], list)): + encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])] + + elif isinstance(text, list) and isinstance(text[0], list): + encodings = [] + + # Maximum number of queries across batch + max_num_queries = max(len(text_single) for text_single in text) + + # Pad all batch samples to max number of text queries + for text_single in text: + if len(text_single) != max_num_queries: + text_single = text_single + [" "] * (max_num_queries - len(text_single)) + + encoding = self.tokenizer(text_single, **output_kwargs["text_kwargs"]) + encodings.append(encoding) + else: + raise TypeError("Input text should be a string, a list of strings or a nested list of strings") + + if return_tensors == "np": + input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + + elif return_tensors == "jax" and is_flax_available(): + import jax.numpy as jnp + + input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + + elif return_tensors == "pt" and is_torch_available(): + import torch + + input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0) + attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0) + + elif return_tensors == "tf" and is_tf_available(): + import tensorflow as tf + + input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0) + + else: + raise ValueError("Target return tensor type could not be returned") + + data["input_ids"] = input_ids + data["attention_mask"] = attention_mask + + if query_images is not None: + query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values + # Query images always override the text prompt + data = {"query_pixel_values": query_pixel_values} + + if images is not None: + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values + + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process(self, *args, **kwargs): + """ + This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring + of this method for more information. + """ + return self.image_processor.post_process(*args, **kwargs) + + def post_process_object_detection(self, *args, **kwargs): + """ + This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer + to the docstring of this method for more information. + """ + warnings.warn( + "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. " + "Use `post_process_grounded_object_detection` instead.", + FutureWarning, + ) + return self.image_processor.post_process_object_detection(*args, **kwargs) + + def post_process_grounded_object_detection( + self, + outputs: "OwlViTObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + text_labels: Optional[list[list[str]]] = None, + ): + """ + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + text_labels (`list[list[str]]`, *optional*): + List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be + set to `None`. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "text_labels": The text labels for each predicted bounding box on the image. + """ + output = self.image_processor.post_process_object_detection( + outputs=outputs, threshold=threshold, target_sizes=target_sizes + ) + + if text_labels is not None and len(text_labels) != len(output): + raise ValueError("Make sure that you pass in as many lists of text labels as images") + + # adding text labels to the output + if text_labels is not None: + for image_output, image_text_labels in zip(output, text_labels): + object_text_labels = [image_text_labels[i] for i in image_output["labels"]] + image_output["text_labels"] = object_text_labels + else: + for image_output in output: + image_output["text_labels"] = None + + return output + + def post_process_image_guided_detection( + self, + outputs: "OwlViTImageGuidedObjectDetectionOutput", + threshold: float = 0.0, + nms_threshold: float = 0.3, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO + api. + + Args: + outputs ([`OwlViTImageGuidedObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.0): + Minimum confidence threshold to use to filter out predicted boxes. + nms_threshold (`float`, *optional*, defaults to 0.3): + IoU threshold for non-maximum suppression of overlapping boxes. + target_sizes (`torch.Tensor`, *optional*): + Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in + the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to + None, predictions will not be unnormalized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "labels": Set to `None`. + """ + return self.image_processor.post_process_image_guided_detection( + outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes + ) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["OwlViTProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14e75f132c14d77feea8fa3f7f4f1f655b9eb1cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d521cb16109bbb655b372656920fc8db95cad8b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b19baaab45581b316741199f2b467befbf9d55b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd1c0a07dfa49a5e9d5baafe74dea9f055460789 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06a2ba66ccfc9b432189d3f00ac72b0f1d7a0f7a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e5d87f0189b9456a7442e9c24d2b449b8c0e880 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0cb1e7a9cd04fb32cb8eb29516d95c0fc8e9d108 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_phi3 import * + from .modeling_phi3 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..33cee6b37ba57ed4c1010f78646274a09489e33d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Phi-3 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Phi3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Phi3Model`]. + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + original_max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model was trained with. This is used to determine the size of the + original RoPE embeddings when using long scaling. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon value used for the RMSNorm. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and + the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size + divided by the number of attention heads divided by 2. + partial_rotary_factor (`float`, *optional*, defaults to 1.0): + Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 32000): + The id of the "end-of-sequence" token. + pad_token_id (`int`, *optional*, defaults to 32000): + The id of the padding token. + sliding_window (`int`, *optional*): + Sliding window attention window size. If `None`, no sliding window is applied. + + Example: + + ```python + >>> from transformers import Phi3Model, Phi3Config + + >>> # Initializing a Phi-3 style configuration + >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + + >>> # Initializing a model from the configuration + >>> model = Phi3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi3" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.qkv_proj": "colwise_rep", # we need to replicate here due to the slicing of qkv + "layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the slicing of qkv + "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation + "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32064, + hidden_size=3072, + intermediate_size=8192, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="silu", + max_position_embeddings=4096, + original_max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + partial_rotary_factor=1.0, + bos_token_id=1, + eos_token_id=32000, + pad_token_id=32000, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.partial_rotary_factor = partial_rotary_factor + self._rope_scaling_adjustment() + self._rope_scaling_validation() + self.sliding_window = sliding_window + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_adjustment(self): + """ + Adjust the `type` of the `rope_scaling` configuration for backward compatibility. + """ + if self.rope_scaling is None: + return + + rope_scaling_type = self.rope_scaling.get("type", None) + + # For backward compatibility if previous version used "su" or "yarn" + if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: + self.rope_scaling["type"] = "longrope" + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: + raise ValueError( + "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) + rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) + if rope_scaling_type is None or rope_scaling_type != "longrope": + raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") + if not ( + isinstance(rope_scaling_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) + ): + raise ValueError( + f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" + ) + rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) + if not len(rope_scaling_short_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}" + ) + if not ( + isinstance(rope_scaling_long_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) + ): + raise ValueError( + f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" + ) + if not len(rope_scaling_long_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}" + ) + + +__all__ = ["Phi3Config"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..23820075a02056eb6b358cc25529f60b0a784364 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py @@ -0,0 +1,547 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/phi3/modular_phi3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_phi3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from transformers.utils.generic import check_model_inputs + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from .configuration_phi3 import Phi3Config + + +class Phi3MLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1) + k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1) + return q_embed, k_embed + + +class Phi3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.num_key_value_heads = config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) + self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + qkv = self.qkv_proj(hidden_states) + query_pos = self.config.num_attention_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(hidden_shape).transpose(1, 2) + key_states = key_states.view(hidden_shape).transpose(1, 2) + value_states = value_states.view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +@use_kernel_forward_from_hub("RMSNorm") +class Phi3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Phi3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Phi3DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Phi3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx) + self.mlp = Phi3MLP(config) + self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.config = config + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + self.resid_attn_dropout(hidden_states) # main diff with Llama + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) # main diff with Llama + return hidden_states + + +@auto_docstring +class Phi3PreTrainedModel(PreTrainedModel): + config: Phi3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Phi3DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Phi3DecoderLayer, + "attentions": Phi3Attention, + } + _version = "0.0.5" + + +class Phi3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Phi3Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class Phi3Model(Phi3PreTrainedModel): + def __init__(self, config: Phi3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Phi3RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +@auto_docstring +class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Phi3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, Phi3ForCausalLM + + >>> model = Phi3ForCausalLM.from_pretrained("meta-phi3/Phi3-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi3/Phi3-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the + # process + + # When the first time input length reached long and short factor switching point, enforce re-compute cache + # It will cause downside of slower at this single token position, however, better than current failure. + if ( + past_key_values + and self.config.rope_scaling + and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 + ): + past_length = cache_position[0] + if past_length <= self.config.original_max_position_embeddings: + past_key_values = None + + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + logits_to_keep=logits_to_keep, + **kwargs, + ) + return model_inputs + + +class Phi3ForSequenceClassification(GenericForSequenceClassification, Phi3PreTrainedModel): + pass + + +class Phi3ForTokenClassification(GenericForTokenClassification, Phi3PreTrainedModel): + pass + + +__all__ = [ + "Phi3PreTrainedModel", + "Phi3Model", + "Phi3ForCausalLM", + "Phi3ForSequenceClassification", + "Phi3ForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..d355c3792a6b056165d6acbd0db8677bb9784728 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PyTorch Phi-3 model.""" + +from typing import Callable, Optional + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from ..mistral.modeling_mistral import ( + MistralDecoderLayer, + MistralForCausalLM, + MistralForSequenceClassification, + MistralForTokenClassification, + MistralPreTrainedModel, + eager_attention_forward, + rotate_half, +) +from .configuration_phi3 import Phi3Config + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" +_CONFIG_FOR_DOC = "Phi3Config" + + +class Phi3MLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1) + k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1) + return q_embed, k_embed + + +class Phi3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.num_key_value_heads = config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) + self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + qkv = self.qkv_proj(hidden_states) + query_pos = self.config.num_attention_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(hidden_shape).transpose(1, 2) + key_states = key_states.view(hidden_shape).transpose(1, 2) + value_states = value_states.view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Phi3DecoderLayer(MistralDecoderLayer): + def __init__(self, config: Phi3Config, layer_idx: int): + super().__init__(config, layer_idx) + self.config = config + self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx) + self.mlp = Phi3MLP(config) + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + self.resid_attn_dropout(hidden_states) # main diff with Llama + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) # main diff with Llama + return hidden_states + + +class Phi3PreTrainedModel(MistralPreTrainedModel): + _version = "0.0.5" + + +class Phi3ForCausalLM(MistralForCausalLM): + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the + # process + + # When the first time input length reached long and short factor switching point, enforce re-compute cache + # It will cause downside of slower at this single token position, however, better than current failure. + if ( + past_key_values + and self.config.rope_scaling + and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 + ): + past_length = cache_position[0] + if past_length <= self.config.original_max_position_embeddings: + past_key_values = None + + model_inputs = GenerationMixin.prepare_inputs_for_generation( + self, + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + logits_to_keep=logits_to_keep, + **kwargs, + ) + return model_inputs + + +class Phi3ForSequenceClassification(MistralForSequenceClassification): + pass + + +class Phi3ForTokenClassification(MistralForTokenClassification): + pass + + +__all__ = [ + "Phi3PreTrainedModel", + "Phi3Model", # noqa: F822 + "Phi3ForCausalLM", + "Phi3ForSequenceClassification", + "Phi3ForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52eb154040e56d482fdb25840afdaa84b062ecf2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7625a06c47dc307419efdab05d7b11068dcc43a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aef8ae6edd715d71645f8c78984dccbef7f56791 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a724c22e13a040c5dbb1e9f2b17045e1cfab04f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8eb49d9785b4eb70fd8513a741a2a4ae2fc54956 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a23da2fba74892aeb88758560fbbd7ce5fce17c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aee60b88331b873e921e9a5e460b4c6e71d5ca03 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de949392e04026370f846ccdcf9ecd834dbf0481 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5aaa2f67ad61c9608ed9cbf831a48ffdd33fd65 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2062d730e55ca57e0c8901b53a2927611654c71 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b84d359475e580df6214a6ab60bf28ec5cc47b7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd1e1308cc1265a16632aba88c3241517dd08b22 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1ffdcb6d777783052422e55652f2ba47726ca81 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ed2f8e310292b8d3b71e78c82ea0e6b838c9225 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e45ef66a183f2748bab1462194d026f5efc724f5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60e5656459c2469790dc8e34ab3c31a1a37a698c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc2494c12750abc3ee449c9896f2eef7fcf368cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9f21035aab5d55669af855be12bc9dc116b6332 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56b2e03277c67ad6939ca44a5f4dda2847f3ec20 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47209372d997a1579385d2139cca741b9c63f752 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20ee818ed82ecd23ccbf7e08a88d9ca2ad38a111 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e30f6c7325b4acba3f38ac076ab915352bec707 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08d4ecb0cb6bcefa2fc79276635c0947028d0874 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..751b383d34a48812fcd1c665d3f5fd00b2d3171a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84a8989cc983f69f0ae24bcf08a26b58a937aa15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3608263468e8f502e0551b75b3d47c6b996cc697 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02ad73526392560a9c3a5f01219ecc5b73ce5435 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62e01fea648c4ea6d62786a2896733c35c704353 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15f84c9be4b76b0cbf1e6d36ce99b867b7c9c9c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf267058d6d43dcf23d504c10a22082bf7c7cd9e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f4c957d500ad4084cc73b094d8bd87ac94e0d1f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c025276e6145169e6240c7a442fea6f829598ba7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a295d582056d3da5ebf931c3da71a3cfafc84ee4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84ef03443bd6176f1522de3866eac9363ab34f76 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34372fe818de71a0c7e81c0f284372aa404d57fc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32d3438aa57738f63a234b26970dabc82c77bf8d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4090a0bfedf990711d933830c82d383e6054dfd7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b917b24e143dada98220305dab6e1f965a3b2c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a4000cb272723d9920136a6c78465e8413a8b4d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_qwen3_vl_moe import * + from .modeling_qwen3_vl_moe import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..25358aa79bff482437632829f6319effad36f138 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -0,0 +1,335 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_qwen3_vl_moe.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class Qwen3VLMoeTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a + Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2MoeModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 5632): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 128000): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 5000000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 1408): + Intermediate size of the routed expert. + num_experts_per_tok (`int`, *optional*, defaults to 4): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 60): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `True`): + Whether to normalize the topk probabilities. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`List[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + head_dim (`int`, *optional*): + The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`. + + ```python + >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig + + >>> # Initializing a Qwen3VLMoe style configuration + >>> configuration = Qwen3VLMoeConfig() + + >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration + >>> model = Qwen3VLMoeForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_vl_moe_text" + base_config_key = "text_config" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Qwen3VLMoe` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=151936, + hidden_size=2048, + intermediate_size=5632, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=128000, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=5000000.0, + attention_bias=False, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=1408, + num_experts_per_tok=4, + num_experts=60, + norm_topk_prob=True, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + rope_scaling=None, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + self.head_dim = head_dim or hidden_size // num_attention_heads + + rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.router_aux_loss_coef = router_aux_loss_coef + self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class Qwen3VLMoeVisionConfig(PretrainedConfig): + model_type = "qwen3_vl_moe" + base_config_key = "vision_config" + + def __init__( + self, + depth=27, + hidden_size=1152, + hidden_act="gelu_pytorch_tanh", + intermediate_size=4304, + num_heads=16, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=3584, + num_position_embeddings=2304, + deepstack_visual_indexes=[8, 16, 24], + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.num_heads = num_heads + self.in_channels = in_channels + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.out_hidden_size = out_hidden_size + self.num_position_embeddings = num_position_embeddings + self.initializer_range = initializer_range + self.deepstack_visual_indexes = deepstack_visual_indexes + + +class Qwen3VLMoeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a + Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 151655): + The image token index to encode the image prompt. + video_token_id (`int`, *optional*, defaults to 151656): + The video token index to encode the image prompt. + vision_start_token_id (`int`, *optional*, defaults to 151652): + The start token index to encode the image prompt. + vision_end_token_id (`int`, *optional*, defaults to 151653): + The end token index to encode the image prompt. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie the word embeddings. + + ```python + >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig + + >>> # Initializing a Qwen3-VL-MOE style configuration + >>> configuration = Qwen3VLMoeConfig() + + >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration + >>> model = Qwen3VLMoeForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_vl_moe" + sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + text_config=None, + vision_config=None, + image_token_id=151655, + video_token_id=151656, + vision_start_token_id=151652, + vision_end_token_id=151653, + tie_word_embeddings=False, + **kwargs, + ): + if isinstance(vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**vision_config) + elif vision_config is None: + self.vision_config = self.sub_configs["vision_config"]() + + if isinstance(text_config, dict): + self.text_config = self.sub_configs["text_config"](**text_config) + elif text_config is None: + self.text_config = self.sub_configs["text_config"]() + + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.vision_start_token_id = vision_start_token_id + self.vision_end_token_id = vision_end_token_id + super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) + + +__all__ = ["Qwen3VLMoeConfig", "Qwen3VLMoeTextConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..88e3f6e19f0eadbda8853b5c3b8f39fc7e61e580 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -0,0 +1,1832 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_qwen3_vl_moe.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import OutputRecorder, check_model_inputs +from .configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig, Qwen3VLMoeVisionConfig + + +@use_kernel_forward_from_hub("RMSNorm") +class Qwen3VLMoeTextRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen3VLMoeTextRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Qwen3VLMoeTextExperts(nn.Module): + def __init__(self, config): + super().__init__() + self.num_experts = config.num_experts + self.intermediate_size = config.moe_intermediate_size + self.hidden_size = config.hidden_size + self.expert_dim = self.intermediate_size + self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim)) + self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size))) + self.act_fn = ACT2FN[config.hidden_act] + + def forward( + self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, router_indices: torch.Tensor + ) -> torch.Tensor: + """ + When training it is more efficient to just loop over the experts and compute the output for each expert + as otherwise the memory would explode. + + For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs. + + Args: + hidden_states (torch.Tensor): (batch_size * token_num, hidden_size) + routing_weights (torch.Tensor): (batch_size * token_num, num_experts) + router_indices (torch.Tensor): (batch_size * token_num, top_k) + Returns: + torch.Tensor + """ + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) # (num_tokens, hidden_size) + if self.training: + next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + # we sum on the top_k and on the sequence length to get which experts + # are hit this time around + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit[:]: + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx[0]]) + current_state = hidden_states[token_idx] + gate_up = current_state @ self.gate_up_proj[expert_idx] + gate, up = gate_up.chunk(2, dim=-1) + gated_output = up * self.act_fn(gate) + out = gated_output @ self.down_proj[expert_idx] + weighted_output = out[0] * routing_weights[token_idx, expert_idx, None] + next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype)) + next_states = next_states.view(batch_size, -1, self.hidden_size) + else: + hidden_states = hidden_states.repeat(self.num_experts, 1) + hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size) + gate_up = torch.bmm(hidden_states, self.gate_up_proj) + gate, up = gate_up.chunk(2, dim=-1) # not supported for DTensors + next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj) + next_states = next_states.reshape(self.num_experts, batch_size, -1, self.hidden_size) + next_states = ( + next_states * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[..., None] + ) + next_states = next_states.sum(dim=0) + return next_states + + +class Qwen3VLMoeTextSparseMoeBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False) + self.experts = Qwen3VLMoeTextExperts(config) + + # since all the models use norm_topk_prob, we don't need to have a extra check for it + # self.norm_topk_prob = config.norm_topk_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) + router_logits = self.gate(hidden_states) + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights) + hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size) + routed_out = self.experts(hidden_states, router_weights, router_indices) + return routed_out, router_logits + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Qwen3VLMoeTextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.q_norm = Qwen3VLMoeTextRMSNorm( + self.head_dim, eps=config.rms_norm_eps + ) # unlike olmo, only on the head dim! + self.k_norm = Qwen3VLMoeTextRMSNorm( + self.head_dim, eps=config.rms_norm_eps + ) # thus post q_norm does not need reshape + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Qwen3VLMoeTextMLP(nn.Module): + def __init__(self, config, intermediate_size=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class Qwen3VLMoeTextDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = Qwen3VLMoeTextAttention(config, layer_idx) + + if (layer_idx not in config.mlp_only_layers) and ( + config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0 + ): + self.mlp = Qwen3VLMoeTextSparseMoeBlock(config) + else: + self.mlp = Qwen3VLMoeTextMLP(config, intermediate_size=config.intermediate_size) + + self.input_layernorm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, + and should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_values (`Cache`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # For the MoE layers, we need to unpack + if isinstance(hidden_states, tuple): + hidden_states, _ = hidden_states + hidden_states = residual + hidden_states + + return hidden_states + + +@auto_docstring +class Qwen3VLMoePreTrainedModel(PreTrainedModel): + config: Qwen3VLMoeConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False # MoE models don't work with torch.compile (`torch.where(condition)` not supported) + _supports_attention_backend = True + _can_record_outputs = { + "router_logits": OutputRecorder(Qwen3VLMoeTextSparseMoeBlock, index=1), + "hidden_states": Qwen3VLMoeTextDecoderLayer, + "attentions": Qwen3VLMoeTextAttention, + } + + def _init_weights(self, module): + """Initialize the weights.""" + super()._init_weights(module) + if hasattr(self.config, "initializer_range"): + std = self.config.initializer_range + else: + std = getattr(self.config.get_text_config(), "initializer_range", 0.02) + if isinstance(module, Qwen3VLMoeTextExperts): + module.gate_up_proj.data.normal_(mean=0.0, std=std) + module.down_proj.data.normal_(mean=0.0, std=std) + + +class Qwen3VLMoeVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True) + self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state))) + + +class Qwen3VLMoeVisionPatchEmbed(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.patch_size = config.patch_size + self.temporal_patch_size = config.temporal_patch_size + self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + + kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size] + self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) + hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim) + return hidden_states + + +class Qwen3VLMoeVisionRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class Qwen3VLMoeVisionPatchMerger(nn.Module): + def __init__(self, config: Qwen3VLMoeVisionConfig, use_postshuffle_norm=False) -> None: + super().__init__() + self.hidden_size = config.hidden_size * (config.spatial_merge_size**2) + self.use_postshuffle_norm = use_postshuffle_norm + self.norm = nn.LayerNorm(self.hidden_size if use_postshuffle_norm else config.hidden_size, eps=1e-6) + self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size) + self.act_fn = nn.GELU() + self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x.view(-1, self.hidden_size) if self.use_postshuffle_norm else x).view(-1, self.hidden_size) + x = self.linear_fc2(self.act_fn(self.linear_fc1(x))) + return x + + +def apply_rotary_pos_emb_vision( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + orig_q_dtype = q.dtype + orig_k_dtype = k.dtype + q, k = q.float(), k.float() + cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float() + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + q_embed = q_embed.to(orig_q_dtype) + k_embed = k_embed.to(orig_k_dtype) + return q_embed, k_embed + + +class Qwen3VLMoeVisionAttention(nn.Module): + def __init__(self, config: Qwen3VLMoeVisionConfig) -> None: + super().__init__() + self.dim = config.hidden_size + self.num_heads = config.num_heads + self.head_dim = self.dim // self.num_heads + self.num_key_value_groups = 1 # needed for eager attention + self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True) + self.proj = nn.Linear(self.dim, self.dim) + self.scaling = self.head_dim**-0.5 + self.config = config + self.attention_dropout = 0.0 + self.is_causal = False + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + seq_length = hidden_states.shape[0] + query_states, key_states, value_states = ( + self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + ) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if self.config._attn_implementation == "flash_attention_2": + # Flash Attention 2: Use cu_seqlens for variable length attention + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + attn_output, _ = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + cu_seq_lens_q=cu_seqlens, + cu_seq_lens_k=cu_seqlens, + max_length_q=max_seqlen, + max_length_k=max_seqlen, + is_causal=False, + **kwargs, + ) + else: + # Other implementations: Process each chunk separately + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + attention_interface( + self, + q, + k, + v, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + is_causal=False, + **kwargs, + )[0] + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + + attn_output = attn_output.reshape(seq_length, -1).contiguous() + attn_output = self.proj(attn_output) + return attn_output + + +class Qwen3VLMoeVisionBlock(GradientCheckpointingLayer): + def __init__(self, config, attn_implementation: str = "sdpa") -> None: + super().__init__() + self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6) + self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6) + self.attn = Qwen3VLMoeVisionAttention(config=config) + self.mlp = Qwen3VLMoeVisionMLP(config=config) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class Qwen3VLMoeVisionModel(Qwen3VLMoePreTrainedModel): + config: Qwen3VLMoeVisionConfig + _no_split_modules = ["Qwen3VLMoeVisionBlock"] + + def __init__(self, config, *inputs, **kwargs) -> None: + super().__init__(config, *inputs, **kwargs) + self.spatial_merge_size = config.spatial_merge_size + self.patch_size = config.patch_size + self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + + self.patch_embed = Qwen3VLMoeVisionPatchEmbed( + config=config, + ) + + self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size) + self.num_grid_per_side = int(config.num_position_embeddings**0.5) + + head_dim = config.hidden_size // config.num_heads + self.rotary_pos_emb = Qwen3VLMoeVisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([Qwen3VLMoeVisionBlock(config) for _ in range(config.depth)]) + self.merger = Qwen3VLMoeVisionPatchMerger( + config=config, + use_postshuffle_norm=False, + ) + + self.deepstack_visual_indexes = config.deepstack_visual_indexes + self.deepstack_merger_list = nn.ModuleList( + [ + Qwen3VLMoeVisionPatchMerger( + config=config, + use_postshuffle_norm=True, + ) + for _ in range(len(config.deepstack_visual_indexes)) + ] + ) + + self.gradient_checkpointing = False + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + merge_size = self.spatial_merge_size + + max_hw = int(grid_thw[:, 1:].max().item()) + freq_table = self.rotary_pos_emb(max_hw) # (max_hw, dim // 2) + device = freq_table.device + + total_tokens = int(torch.prod(grid_thw, dim=1).sum().item()) + pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device) + + offset = 0 + for num_frames, height, width in grid_thw: + merged_h, merged_w = height // merge_size, width // merge_size + + block_rows = torch.arange(merged_h, device=device) # block row indices + block_cols = torch.arange(merged_w, device=device) # block col indices + intra_row = torch.arange(merge_size, device=device) # intra-block row offsets + intra_col = torch.arange(merge_size, device=device) # intra-block col offsets + + # Compute full-resolution positions + row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None] + col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :] + + row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + + coords = torch.stack((row_idx, col_idx), dim=-1) + + if num_frames > 1: + coords = coords.repeat(num_frames, 1) + + num_tokens = coords.shape[0] + pos_ids[offset : offset + num_tokens] = coords + offset += num_tokens + + embeddings = freq_table[pos_ids] # lookup rotary embeddings + embeddings = embeddings.flatten(1) + return embeddings + + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device) + weight_tensor = torch.tensor( + weight_list, dtype=self.pos_embed.weight.dtype, device=self.pos_embed.weight.device + ) + pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor: + """ + Args: + hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): + The final hidden states of the model. + grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`): + The temporal, height and width of feature shape of each image in LLM. + + Returns: + `torch.Tensor`: hidden_states. + """ + hidden_states = self.patch_embed(hidden_states) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + # Select dtype based on the following factors: + # - FA2 requires that cu_seqlens_q must have dtype int32 + # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw + # See https://github.com/huggingface/transformers/pull/34852 for more information + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + position_embeddings=position_embeddings, + **kwargs, + ) + if layer_num in self.deepstack_visual_indexes: + deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)]( + hidden_states + ) + deepstack_feature_lists.append(deepstack_feature) + + hidden_states = self.merger(hidden_states) + + return hidden_states, deepstack_feature_lists + + +class Qwen3VLMoeTextRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Qwen3VLMoeTextConfig, device=None): + super().__init__() + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", "default") + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20]) + + def apply_interleaved_mrope(self, freqs, mrope_section): + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. + args: + x: (3, bs, seq_len, head_dim // 2) + mrope_section: (3,) + returns: + x_t: (bs, seq_len, head_dim // 2) + """ + freqs_t = freqs[0] # just overwrite the first dimension T + for dim, offset in enumerate((1, 2), start=1): # H, W + length = mrope_section[dim] * 3 + idx = slice(offset, length, 3) + freqs_t[..., idx] = freqs[dim, ..., idx] + return freqs_t + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + # In contrast to other models, Qwen3VLMoe has different position ids for the grids + # So we expand the inv_freq to shape (3, ...) + if position_ids.ndim == 2: + position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) + inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1) + position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3) + freqs = self.apply_interleaved_mrope(freqs, self.mrope_section) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring( + custom_intro=( + "Text part of Qwen3VLMoe, " + "not a pure text-only model, as DeepStack integrates visual features into the early hidden states." + ) +) +class Qwen3VLMoeTextModel(Qwen3VLMoePreTrainedModel): + config: Qwen3VLMoeTextConfig + _no_split_modules = ["Qwen3VLMoeTextDecoderLayer"] + + def __init__(self, config: Qwen3VLMoeTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen3VLMoeTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Qwen3VLMoeTextRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + # args for deepstack + visual_pos_masks: Optional[torch.Tensor] = None, + deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, BaseModelOutputWithPast]: + r""" + visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): + The mask of the visual positions. + deepstack_visual_embeds (`list[torch.Tensor]`, *optional*): + The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim). + The feature is extracted from the different visual encoder layers, and fed to the decoder + hidden states. It's from the paper DeepStack(https://arxiv.org/abs/2406.04334). + """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + # the hard coded `3` is for temporal, height and width. + if position_ids is None: + position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1) + elif position_ids.ndim == 2: + position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) + + if position_ids.ndim == 3 and position_ids.shape[0] == 4: + text_position_ids = position_ids[0] + position_ids = position_ids[1:] + else: + text_position_ids = position_ids[0] + + attention_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=text_position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + for layer_idx, decoder_layer in enumerate(self.layers): + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=text_position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = layer_outputs + + # add visual features to the hidden states of first several layers + if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)): + hidden_states = self._deepstack_process( + hidden_states, + visual_pos_masks, + deepstack_visual_embeds[layer_idx], + ) + + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + def _deepstack_process( + self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor + ): + visual_pos_masks = visual_pos_masks.to(hidden_states.device) + visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) + local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states[visual_pos_masks, :] = local_this + return hidden_states + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Qwen3VLMoe causal language model (or autoregressive) outputs. + """ +) +class Qwen3VLMoeCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + rope_deltas: Optional[torch.LongTensor] = None + aux_loss: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava outputs, with hidden states and attentions. + """ +) +class Qwen3VLMoeModelOutputWithPast(ModelOutput): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + rope_deltas: Optional[torch.LongTensor] = None + + +@auto_docstring +class Qwen3VLMoeModel(Qwen3VLMoePreTrainedModel): + base_model_prefix = "" + _checkpoint_conversion_mapping = {} + # Reference: fix gemma3 grad acc #37208 + accepts_loss_kwargs = False + config: Qwen3VLMoeConfig + _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"] + + def __init__(self, config): + super().__init__(config) + self.visual = Qwen3VLMoeVisionModel._from_config(config.vision_config) + self.language_model = Qwen3VLMoeTextModel._from_config(config.text_config) + self.rope_deltas = None # cache rope_deltas here + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_rope_index( + self, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Different from the original implementation, Qwen3VLMoe use timestamps rather than absolute time position ids.""" + + # Since we use timestamps to seperate videos, like , the video_grid_thw should also be split + if video_grid_thw is not None: + video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0) + video_grid_thw[:, 0] = 1 + + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos) + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned. + + Args: + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input videos. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + """ + # Same implementation as for images + return self.get_image_features(pixel_values_videos, video_grid_thw) + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input images. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + """ + pixel_values = pixel_values.type(self.visual.dtype) + image_embeds, deepstack_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds, deepstack_image_embeds + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.video_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}" + ) + + return special_image_mask, special_video_mask + + @auto_docstring + @check_model_inputs + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]: + r""" + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + image_mask = None + video_mask = None + + if pixel_values is not None: + image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None: + video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + _, video_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + visual_pos_masks = None + deepstack_visual_embeds = None + if image_mask is not None and video_mask is not None: + # aggregate visual_pos_masks and deepstack_visual_embeds + image_mask = image_mask[..., 0] + video_mask = video_mask[..., 0] + visual_pos_masks = image_mask | video_mask + deepstack_visual_embeds = [] + image_mask_joint = image_mask[visual_pos_masks] + video_mask_joint = video_mask[visual_pos_masks] + for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds): + embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device) + embed_joint[image_mask_joint, :] = img_embed + embed_joint[video_mask_joint, :] = vid_embed + deepstack_visual_embeds.append(embed_joint) + elif image_mask is not None: + image_mask = image_mask[..., 0] + visual_pos_masks = image_mask + deepstack_visual_embeds = deepstack_image_embeds + elif video_mask is not None: + video_mask = video_mask[..., 0] + visual_pos_masks = video_mask + deepstack_visual_embeds = deepstack_video_embeds + + if position_ids is None: + attention_mask_tensor = ( + attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"] + ) + if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4: + attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2) + # Only apply conversion for floating point tensors (inverted masks) + if attention_mask_tensor.dtype.is_floating_point: + attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min + attention_mask_tensor = (1.0 - attention_mask_tensor).int() + + # Calculate RoPE index once per generation in the pre-fill stage only. + # When compiling, we can't check tensor values thus we check only input length + # It is safe to assume that `length!=1` means we're in pre-fill because compiled + # models currently cannot do asssisted decoding + prefill_compiled_stage = is_torchdynamo_compiling() and ( + (input_ids is not None and input_ids.shape[1] != 1) + or (inputs_embeds is not None and inputs_embeds.shape[1] != 1) + ) + prefill_noncompiled_stage = not is_torchdynamo_compiling() and ( + (cache_position is not None and cache_position[0] == 0) + or (past_key_values is None or past_key_values.get_seq_length() == 0) + ) + if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=attention_mask_tensor, + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = ( + (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) + if cache_position is not None + else 0 + ) + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + outputs = self.language_model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + **kwargs, + ) + + return Qwen3VLMoeModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + rope_deltas=self.rope_deltas, + ) + + +def load_balancing_loss_func( + gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], + num_experts: Optional[int] = None, + top_k=2, + attention_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, int]: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits: + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts: + Number of experts + top_k: + The number of experts to route per-token, can be also interpreted as the `top-k` routing + parameter. + attention_mask (`torch.Tensor`, *optional*): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. + """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + compute_device = gate_logits[0].device + concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) + + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + + _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape(-1, top_k, num_experts) + .to(compute_device) + ) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( + expert_attention_mask, dim=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape(-1, num_experts) + .to(compute_device) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum( + router_per_expert_attention_mask, dim=0 + ) + + overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +class Qwen3VLMoeForConditionalGeneration(Qwen3VLMoePreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + # Reference: fix gemma3 grad acc #37208 + accepts_loss_kwargs = False + config: Qwen3VLMoeConfig + + def __init__(self, config): + super().__init__(config) + self.model = Qwen3VLMoeModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + return self.model.get_video_features(pixel_values_videos, video_grid_thw) + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + return self.model.get_image_features(pixel_values, image_grid_thw) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def visual(self): + return self.model.visual + + @check_model_inputs + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Qwen3VLMoeCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + + Example: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration + + >>> model = Qwen3VLMoeForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct", dtype="auto", device_map="auto") + >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct") + + >>> messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + {"type": "text", "text": "Describe this image in short."}, + ], + } + ] + + >>> # Preparation for inference + >>> inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ) + >>> inputs = inputs.to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=128) + >>> generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + >>> processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "A woman in a plaid shirt sits on a sandy beach at sunset, smiling as she gives a high-five to a yellow Labrador Retriever wearing a harness. The ocean waves roll in the background." + ```""" + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + + aux_loss = None + if kwargs.get("output_router_logits", False): + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.config.text_config.num_experts, + self.config.text_config.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.config.text_config.router_aux_loss_coef * aux_loss.to( + loss.device + ) # make sure to reside in the same device + + return Qwen3VLMoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + rope_deltas=outputs.rope_deltas, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + **kwargs, + ) + + # Qwen3VLMoe position_ids are prepareed with rope_deltas in forward + model_inputs["position_ids"] = None + + if cache_position[0] != 0: + model_inputs["pixel_values"] = None + model_inputs["pixel_values_videos"] = None + + return model_inputs + + def _get_image_nums_and_video_nums( + self, + input_ids: Optional[torch.LongTensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Get the number of images and videos for each sample to calculate the separation length of the sample tensor. + These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Returns: + image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`) + video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`) + """ + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + + if inputs_embeds is not None: + vision_start_mask = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + image_mask = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + video_mask = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + else: + vision_start_mask = input_ids == vision_start_token_id + image_mask = input_ids == image_token_id + video_mask = input_ids == video_token_id + + vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1) + image_nums = torch.sum(vision_first_mask & image_mask, dim=1) + video_nums = torch.sum(vision_first_mask & video_mask, dim=1) + + return image_nums, video_nums + + def _expand_inputs_for_generation( + self, + expand_size: int = 1, + is_encoder_decoder: bool = False, + input_ids: Optional[torch.LongTensor] = None, + **model_kwargs, + ) -> tuple[torch.LongTensor, dict[str, Any]]: + # Overwritten -- Support for expanding tensors without a batch size dimension + # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t + # pixel_values.shape[0] is sum(seqlen_images for samples) + # image_grid_thw.shape[0] is sum(num_images for samples) + + if expand_size == 1: + return input_ids, model_kwargs + + visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"] + + def _expand_dict_for_generation_visual(dict_to_expand): + image_grid_thw = model_kwargs.get("image_grid_thw", None) + video_grid_thw = model_kwargs.get("video_grid_thw", None) + image_nums, video_nums = self._get_image_nums_and_video_nums( + input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None) + ) + + def _repeat_interleave_samples(x, lengths, repeat_times): + samples = torch.split(x, lengths) + repeat_args = [repeat_times] + [1] * (x.dim() - 1) + result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0) + return result + + for key in dict_to_expand: + if key == "pixel_values": + # split images into samples + samples = torch.split(image_grid_thw, list(image_nums)) + # compute the sequence length of images for each sample + lengths = [torch.prod(sample, dim=1).sum() for sample in samples] + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "image_grid_thw": + # get the num of images for each sample + lengths = list(image_nums) + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "pixel_values_videos": + samples = torch.split(video_grid_thw, list(video_nums)) + lengths = [torch.prod(sample, dim=1).sum() for sample in samples] + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "video_grid_thw": + lengths = list(video_nums) + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "second_per_grid_ts": + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size + ) + return dict_to_expand + + def _expand_dict_for_generation(dict_to_expand): + for key in dict_to_expand: + if ( + key != "cache_position" + and dict_to_expand[key] is not None + and isinstance(dict_to_expand[key], torch.Tensor) + and key not in visual_keys + ): + dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0) + return dict_to_expand + + model_kwargs = _expand_dict_for_generation_visual(model_kwargs) + + if input_ids is not None: + input_ids = input_ids.repeat_interleave(expand_size, dim=0) + + model_kwargs = _expand_dict_for_generation(model_kwargs) + + if is_encoder_decoder: + if model_kwargs.get("encoder_outputs") is None: + raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") + model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"]) + + return input_ids, model_kwargs + + +__all__ = [ + "Qwen3VLMoeVisionModel", + "Qwen3VLMoeForConditionalGeneration", + "Qwen3VLMoeModel", + "Qwen3VLMoePreTrainedModel", + "Qwen3VLMoeTextModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..72d3452bdc50a244b6cf3fdd9bd76dda5b2fc0f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -0,0 +1,551 @@ +# coding=utf-8 +# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Qwen3-VL-MOE model.""" + +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging +from ..qwen3_moe.modeling_qwen3_moe import ( + Qwen3MoeDecoderLayer, + Qwen3MoePreTrainedModel, + Qwen3MoeRMSNorm, + load_balancing_loss_func, +) +from ..qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig +from ..qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLCausalLMOutputWithPast, + Qwen3VLForConditionalGeneration, + Qwen3VLModel, + Qwen3VLTextAttention, + Qwen3VLTextModel, + Qwen3VLVisionModel, +) + + +logger = logging.get_logger(__name__) + + +class Qwen3VLMoeTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a + Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2MoeModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 5632): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 128000): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 5000000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 1408): + Intermediate size of the routed expert. + num_experts_per_tok (`int`, *optional*, defaults to 4): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 60): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `True`): + Whether to normalize the topk probabilities. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`List[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + head_dim (`int`, *optional*): + The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`. + + ```python + >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig + + >>> # Initializing a Qwen3VLMoe style configuration + >>> configuration = Qwen3VLMoeConfig() + + >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration + >>> model = Qwen3VLMoeForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_vl_moe_text" + base_config_key = "text_config" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Qwen3VLMoe` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=151936, + hidden_size=2048, + intermediate_size=5632, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=128000, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=5000000.0, + attention_bias=False, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=1408, + num_experts_per_tok=4, + num_experts=60, + norm_topk_prob=True, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + rope_scaling=None, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + self.head_dim = head_dim or hidden_size // num_attention_heads + + rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.router_aux_loss_coef = router_aux_loss_coef + self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class Qwen3VLMoeVisionConfig(Qwen3VLVisionConfig): + pass + + +class Qwen3VLMoeConfig(Qwen3VLConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a + Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 151655): + The image token index to encode the image prompt. + video_token_id (`int`, *optional*, defaults to 151656): + The video token index to encode the image prompt. + vision_start_token_id (`int`, *optional*, defaults to 151652): + The start token index to encode the image prompt. + vision_end_token_id (`int`, *optional*, defaults to 151653): + The end token index to encode the image prompt. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie the word embeddings. + + ```python + >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig + + >>> # Initializing a Qwen3-VL-MOE style configuration + >>> configuration = Qwen3VLMoeConfig() + + >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration + >>> model = Qwen3VLMoeForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_vl_moe" + sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig} + + +class Qwen3VLMoeTextRMSNorm(Qwen3MoeRMSNorm): + pass + + +class Qwen3VLMoeTextExperts(nn.Module): + def __init__(self, config): + super().__init__() + self.num_experts = config.num_experts + self.intermediate_size = config.moe_intermediate_size + self.hidden_size = config.hidden_size + self.expert_dim = self.intermediate_size + self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim)) + self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size))) + self.act_fn = ACT2FN[config.hidden_act] + + def forward( + self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, router_indices: torch.Tensor + ) -> torch.Tensor: + """ + When training it is more efficient to just loop over the experts and compute the output for each expert + as otherwise the memory would explode. + + For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs. + + Args: + hidden_states (torch.Tensor): (batch_size * token_num, hidden_size) + routing_weights (torch.Tensor): (batch_size * token_num, num_experts) + router_indices (torch.Tensor): (batch_size * token_num, top_k) + Returns: + torch.Tensor + """ + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) # (num_tokens, hidden_size) + if self.training: + next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + # we sum on the top_k and on the sequence length to get which experts + # are hit this time around + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit[:]: + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx[0]]) + current_state = hidden_states[token_idx] + gate_up = current_state @ self.gate_up_proj[expert_idx] + gate, up = gate_up.chunk(2, dim=-1) + gated_output = up * self.act_fn(gate) + out = gated_output @ self.down_proj[expert_idx] + weighted_output = out[0] * routing_weights[token_idx, expert_idx, None] + next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype)) + next_states = next_states.view(batch_size, -1, self.hidden_size) + else: + hidden_states = hidden_states.repeat(self.num_experts, 1) + hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size) + gate_up = torch.bmm(hidden_states, self.gate_up_proj) + gate, up = gate_up.chunk(2, dim=-1) # not supported for DTensors + next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj) + next_states = next_states.reshape(self.num_experts, batch_size, -1, self.hidden_size) + next_states = ( + next_states * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[..., None] + ) + next_states = next_states.sum(dim=0) + return next_states + + +class Qwen3VLMoeTextSparseMoeBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False) + self.experts = Qwen3VLMoeTextExperts(config) + + # since all the models use norm_topk_prob, we don't need to have a extra check for it + # self.norm_topk_prob = config.norm_topk_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) + router_logits = self.gate(hidden_states) + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights) + hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size) + routed_out = self.experts(hidden_states, router_weights, router_indices) + return routed_out, router_logits + + +class Qwen3VLMoeTextAttention(Qwen3VLTextAttention): + pass + + +class Qwen3VLMoeTextDecoderLayer(Qwen3MoeDecoderLayer): + pass + + +class Qwen3VLMoePreTrainedModel(Qwen3MoePreTrainedModel): + config: Qwen3VLMoeConfig + _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"] + + def _init_weights(self, module): + """Initialize the weights.""" + PreTrainedModel._init_weights(self, module) + if hasattr(self.config, "initializer_range"): + std = self.config.initializer_range + else: + std = getattr(self.config.get_text_config(), "initializer_range", 0.02) + if isinstance(module, Qwen3VLMoeTextExperts): + module.gate_up_proj.data.normal_(mean=0.0, std=std) + module.down_proj.data.normal_(mean=0.0, std=std) + + +class Qwen3VLMoeVisionModel(Qwen3VLVisionModel): + pass + + +class Qwen3VLMoeTextModel(Qwen3VLTextModel): + pass + + +class Qwen3VLMoeCausalLMOutputWithPast(Qwen3VLCausalLMOutputWithPast): + aux_loss: Optional[torch.FloatTensor] = None + + +class Qwen3VLMoeModel(Qwen3VLModel): + pass + + +class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + + Example: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration + + >>> model = Qwen3VLMoeForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct", dtype="auto", device_map="auto") + >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct") + + >>> messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + {"type": "text", "text": "Describe this image in short."}, + ], + } + ] + + >>> # Preparation for inference + >>> inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ) + >>> inputs = inputs.to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=128) + >>> generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + >>> processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "A woman in a plaid shirt sits on a sandy beach at sunset, smiling as she gives a high-five to a yellow Labrador Retriever wearing a harness. The ocean waves roll in the background." + ```""" + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + + aux_loss = None + if kwargs.get("output_router_logits", False): + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.config.text_config.num_experts, + self.config.text_config.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.config.text_config.router_aux_loss_coef * aux_loss.to( + loss.device + ) # make sure to reside in the same device + + return Qwen3VLMoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + rope_deltas=outputs.rope_deltas, + ) + + +__all__ = [ + "Qwen3VLMoeConfig", + "Qwen3VLMoeTextConfig", + "Qwen3VLMoeVisionModel", + "Qwen3VLMoeForConditionalGeneration", + "Qwen3VLMoeModel", + "Qwen3VLMoePreTrainedModel", + "Qwen3VLMoeTextModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd0ed7feaa3806121b285691d022704634debce2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca0fffb7461d8c7d244adcd9270c0455cec52de1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f2e1233c4774c95493b660f839c0b50293432e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fb9bc2235392dfe63137cdd35344289a81e4b89 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1713e724ff58b2b0ad8a3815706cea07aaa17dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..371ebcc02ee7b79c65bc12dc8046e5c35e7b1351 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..915a2519b1a18d93d0b9f3aef570e3d7c640187e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d388f68edf43587e6a28bf24036ccb82343ee1ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13fe8b4cf544222c150caec042c8af3ea56a785d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a288723a1c31abf17b4fbcb64c3a2f11cbebfa4b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7ea64fd2977c97d91568e672546fbde06011801 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6565ef8bcf4fbae45c4d06bd7fa211b603d9d6d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c807b71634985914c1ea740037b13510a210b9d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ba9be42196bcac1c0fef10428fe99d610fa9ab3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20cde5e6535ef7021d0056ecf5164ac715863bd8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc0b9abd4763a724917b9430c6c16cfd0d73b5cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c6849c3e0576ac5a7d475c504e9114b56c917e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b303fb5e9692991cd79ecd3f7f8ff331c1a89f3c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73a2f1e4bd6dbee855f897c49d447632b5415977 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..636fb1aebfc3795915444c1c6ce516789965f0c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ad2eff844ebc864fb3feb7389a32c689042a52c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..581483bd4e47c4cdb064244879a9f82c22b12bfa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86b2572e8a9e15f2c3e80aee3e3cd285bf4538d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..956ea996744a4d6dfa56f5c85d73d40caebb33e6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc32caa61d8404598e17fff31c4838627f547d31 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ca8ce490f2f45275fa21c6214cbe584f23d1d5c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f188b068e1ee5696c234f0d7609bbb7e2442334d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..208878343d24b3cec254a918f59d69841d1110c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_roberta_prelayernorm import * + from .modeling_flax_roberta_prelayernorm import * + from .modeling_roberta_prelayernorm import * + from .modeling_tf_roberta_prelayernorm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..72bc808c450d692903bb8826624afb0efb0199d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RoBERTa-PreLayerNorm configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.roberta.configuration_roberta.RobertaConfig with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,RoBERTa->RoBERTa-PreLayerNorm,Roberta->RobertaPreLayerNorm,roberta->roberta-prelayernorm +class RobertaPreLayerNormConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RobertaPreLayerNormModel`] or a [`TFRobertaPreLayerNormModel`]. It is + used to instantiate a RoBERTa-PreLayerNorm model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa-PreLayerNorm + [andreasmadsen/efficient_mlm_m0.40](https://huggingface.co/andreasmadsen/efficient_mlm_m0.40) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the RoBERTa-PreLayerNorm model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`RobertaPreLayerNormModel`] or [`TFRobertaPreLayerNormModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`RobertaPreLayerNormModel`] or [`TFRobertaPreLayerNormModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import RobertaPreLayerNormConfig, RobertaPreLayerNormModel + + >>> # Initializing a RoBERTa-PreLayerNorm configuration + >>> configuration = RobertaPreLayerNormConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = RobertaPreLayerNormModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "roberta-prelayernorm" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +# Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->RobertaPreLayerNorm +class RobertaPreLayerNormOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["RobertaPreLayerNormConfig", "RobertaPreLayerNormOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..f65dc07bb165ce1216831243405de26612d26232 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py @@ -0,0 +1,1527 @@ +# coding=utf-8 +# Copyright 2022 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax RoBERTa-PreLayerNorm model.""" + +from typing import Callable, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxBaseModelOutputWithPooling, + FlaxBaseModelOutputWithPoolingAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40" +_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig" + +remat = nn_partitioning.remat + + +# Copied from transformers.models.roberta.modeling_flax_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + input_ids: jnp.ndarray + padding_idx: int + + Returns: jnp.ndarray + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = (input_ids != padding_idx).astype("i4") + + if mask.ndim > 2: + mask = mask.reshape((-1, mask.shape[-1])) + incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask + incremental_indices = incremental_indices.reshape(input_ids.shape) + else: + incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask + + return incremental_indices.astype("i4") + padding_idx + + +ROBERTA_PRELAYERNORM_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormSelfAttention(nn.Module): + config: RobertaPreLayerNormConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxRobertaPreLayerNormSelfOutput(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = hidden_states + input_tensor + return hidden_states + + +class FlaxRobertaPreLayerNormAttention(nn.Module): + config: RobertaPreLayerNormConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.self = FlaxRobertaPreLayerNormSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + self.output = FlaxRobertaPreLayerNormSelfOutput(self.config, dtype=self.dtype) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + hidden_states_pre_layer_norm = self.LayerNorm(hidden_states) + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + attn_outputs = self.self( + hidden_states_pre_layer_norm, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +class FlaxRobertaPreLayerNormIntermediate(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class FlaxRobertaPreLayerNormOutput(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = hidden_states + attention_output + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormLayer(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxRobertaPreLayerNormAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype) + self.intermediate = FlaxRobertaPreLayerNormIntermediate(self.config, dtype=self.dtype) + self.output = FlaxRobertaPreLayerNormOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxRobertaPreLayerNormAttention(self.config, causal=False, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormLayerCollection(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxRobertaPreLayerNormCheckpointLayer = remat(FlaxRobertaPreLayerNormLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxRobertaPreLayerNormCheckpointLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxRobertaPreLayerNormLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormEncoder(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxRobertaPreLayerNormLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormPooler(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + cls_hidden_state = hidden_states[:, 0] + cls_hidden_state = self.dense(cls_hidden_state) + return nn.tanh(cls_hidden_state) + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaLMHead with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormLMHead(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.decoder = nn.Dense( + self.config.vocab_size, + dtype=self.dtype, + use_bias=False, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.dense(hidden_states) + hidden_states = ACT2FN["gelu"](hidden_states) + hidden_states = self.layer_norm(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + bias = jnp.asarray(self.bias, self.dtype) + hidden_states += bias + return hidden_states + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaClassificationHead with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormClassificationHead(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.out_proj = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + def __call__(self, hidden_states, deterministic=True): + hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.dense(hidden_states) + hidden_states = nn.tanh(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaPreTrainedModel with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class FlaxRobertaPreLayerNormPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RobertaPreLayerNormConfig + base_model_prefix = "roberta_prelayernorm" + + module_class: nn.Module = None + + def __init__( + self, + config: RobertaPreLayerNormConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.ones_like(input_ids) + position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[dict] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxRobertaPreLayerNormAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxRobertaPreLayerNormModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxRobertaPreLayerNormEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxRobertaPreLayerNormEncoder( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.pooler = FlaxRobertaPreLayerNormPooler(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # make sure `token_type_ids` is correctly initialized when not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # make sure `position_ids` is correctly initialized when not passed + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + hidden_states = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + outputs = self.encoder( + hidden_states, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + hidden_states = self.LayerNorm(hidden_states) + pooled = self.pooler(hidden_states) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaModel with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormModel(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormModule + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormModel, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPooling, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMaskedLMModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class FlaxRobertaPreLayerNormForMaskedLMModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][ + "embedding" + ] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.lm_head(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMaskedLM with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForMaskedLM(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForMaskedLMModule + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormForMaskedLM, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPooling, + _CONFIG_FOR_DOC, + mask="", +) + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForSequenceClassificationModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class FlaxRobertaPreLayerNormForSequenceClassificationModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.classifier = FlaxRobertaPreLayerNormClassificationHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top + of the pooled output) e.g. for GLUE tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForSequenceClassification with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForSequenceClassification(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm +class FlaxRobertaPreLayerNormForMultipleChoiceModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled + output and a softmax) e.g. for RocStories/SWAG tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMultipleChoice with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForMultipleChoice(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForMultipleChoiceModule + + +overwrite_call_docstring( + FlaxRobertaPreLayerNormForMultipleChoice, + ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"), +) +append_call_sample_docstring( + FlaxRobertaPreLayerNormForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm +class FlaxRobertaPreLayerNormForTokenClassificationModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForTokenClassification with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForTokenClassification(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForTokenClassificationModule + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm +class FlaxRobertaPreLayerNormForQuestionAnsweringModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model with a span classification head on top for extractive question-answering tasks like SQuAD + (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForQuestionAnswering with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForQuestionAnswering(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLMModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class FlaxRobertaPreLayerNormForCausalLMModule(nn.Module): + config: RobertaPreLayerNormConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + token_type_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][ + "embedding" + ] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.lm_head(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model with a language modeling head on top (a linear layer on top of the hidden-states output) + e.g for autoregressive tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLM with Roberta->RobertaPreLayerNorm +class FlaxRobertaPreLayerNormForCausalLM(FlaxRobertaPreLayerNormPreTrainedModel): + module_class = FlaxRobertaPreLayerNormForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxRobertaPreLayerNormForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxRobertaPreLayerNormForCausalLM", + "FlaxRobertaPreLayerNormForMaskedLM", + "FlaxRobertaPreLayerNormForMultipleChoice", + "FlaxRobertaPreLayerNormForQuestionAnswering", + "FlaxRobertaPreLayerNormForSequenceClassification", + "FlaxRobertaPreLayerNormForTokenClassification", + "FlaxRobertaPreLayerNormModel", + "FlaxRobertaPreLayerNormPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..81481574b01eacae6f6f5eb0fa25c64ac287c9c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -0,0 +1,1442 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RoBERTa-PreLayerNorm model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, gelu +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->RobertaPreLayerNorm +class RobertaPreLayerNormEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RobertaPreLayerNorm +class RobertaPreLayerNormSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = self.query(hidden_states) + query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = self.key(current_states) + key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + value_layer = self.value(current_states) + value_layer = value_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaPreLayerNormModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +class RobertaPreLayerNormSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +class RobertaPreLayerNormAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = RobertaPreLayerNormSelfAttention( + config, position_embedding_type=position_embedding_type, layer_idx=layer_idx + ) + self.output = RobertaPreLayerNormSelfOutput(config) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pruned_heads = set() + + # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + hidden_states_pre_layer_norm = self.LayerNorm(hidden_states) + self_outputs = self.self( + hidden_states_pre_layer_norm, + attention_mask, + head_mask, + encoder_hidden_states, + past_key_values, + output_attentions, + cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RobertaPreLayerNormIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class RobertaPreLayerNormOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->RobertaPreLayerNorm +class RobertaPreLayerNormLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RobertaPreLayerNormAttention(config, layer_idx=layer_idx) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = RobertaPreLayerNormAttention( + config, position_embedding_type="absolute", layer_idx=layer_idx + ) + self.intermediate = RobertaPreLayerNormIntermediate(config) + self.output = RobertaPreLayerNormOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->RobertaPreLayerNorm +class RobertaPreLayerNormEncoder(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [RobertaPreLayerNormLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and self.config.is_decoder and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class RobertaPreLayerNormPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class RobertaPreLayerNormPreTrainedModel(PreTrainedModel): + config: RobertaPreLayerNormConfig + base_model_prefix = "roberta_prelayernorm" + supports_gradient_checkpointing = True + _no_split_modules = ["RobertaPreLayerNormEmbeddings", "RobertaPreLayerNormSelfAttention"] + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->RobertaPreLayerNormLMHead + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, RobertaPreLayerNormLMHead): + module.bias.data.zero_() + + +@auto_docstring( + custom_intro=""" + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762 + """ +) +class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = RobertaPreLayerNormEmbeddings(config) + self.encoder = RobertaPreLayerNormEncoder(config) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.pooler = RobertaPreLayerNormPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.LayerNorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + RoBERTa-PreLayerNorm Model with a `language modeling` head on top for CLM fine-tuning. + """ +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer +class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + "If you want to use `RobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`" + ) + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) + self.lm_head = RobertaPreLayerNormLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> from transformers import AutoTokenizer, RobertaPreLayerNormForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40") + >>> config = AutoConfig.from_pretrained("andreasmadsen/efficient_mlm_m0.40") + >>> config.is_decoder = True + >>> model = RobertaPreLayerNormForCausalLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + RoBERTa-PreLayerNorm Model with a `language modeling` head on top. + """ +) +class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `RobertaPreLayerNormForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) + self.lm_head = RobertaPreLayerNormLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.forward with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->RobertaPreLayerNorm +class RobertaPreLayerNormLMHead(nn.Module): + """RobertaPreLayerNorm Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + self.bias = self.decoder.bias + + +@auto_docstring( + custom_intro=""" + RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top + of the pooled output) e.g. for GLUE tasks. + """ +) +class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) + self.classifier = RobertaPreLayerNormClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.forward with roberta->roberta_prelayernorm + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta_prelayernorm( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(reshaped_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.forward with roberta->roberta_prelayernorm + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->RobertaPreLayerNorm +class RobertaPreLayerNormClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@auto_docstring +class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.forward with roberta->roberta_prelayernorm + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +__all__ = [ + "RobertaPreLayerNormForCausalLM", + "RobertaPreLayerNormForMaskedLM", + "RobertaPreLayerNormForMultipleChoice", + "RobertaPreLayerNormForQuestionAnswering", + "RobertaPreLayerNormForSequenceClassification", + "RobertaPreLayerNormForTokenClassification", + "RobertaPreLayerNormModel", + "RobertaPreLayerNormPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..0a370f390269240fb02d55f493fbf68905f99977 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -0,0 +1,1807 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 RoBERTa-PreLayerNorm model.""" + +from __future__ import annotations + +import math +import warnings + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40" +_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig" + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->RobertaPreLayerNorm +class TFRobertaPreLayerNormEmbeddings(keras.layers.Layer): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.padding_idx = 1 + self.config = config + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + input_ids: tf.Tensor + Returns: tf.Tensor + """ + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask + + return incremental_indices + self.padding_idx + + def call( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + inputs_embeds=None, + past_key_values_length=0, + training=False, + ): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids=input_ids, past_key_values_length=past_key_values_length + ) + else: + position_ids = tf.expand_dims( + tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RobertaPreLayerNorm +class TFRobertaPreLayerNormPooler(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm +class TFRobertaPreLayerNormSelfAttention(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFRobertaPreLayerNormModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFRobertaPreLayerNormSelfOutput(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = hidden_states + input_tensor + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFRobertaPreLayerNormAttention(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self") + self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + hidden_states_pre_layer_norm = self.LayerNorm(inputs=input_tensor) + self_outputs = self.self_attention( + hidden_states=hidden_states_pre_layer_norm, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFRobertaPreLayerNormIntermediate(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.LayerNorm(inputs=hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFRobertaPreLayerNormOutput(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = hidden_states + input_tensor + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm +class TFRobertaPreLayerNormLayer(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFRobertaPreLayerNormAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFRobertaPreLayerNormAttention(config, name="crossattention") + self.intermediate = TFRobertaPreLayerNormIntermediate(config, name="intermediate") + self.bert_output = TFRobertaPreLayerNormOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm +class TFRobertaPreLayerNormEncoder(keras.layers.Layer): + def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFRobertaPreLayerNormLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: tuple[tuple[tf.Tensor]] | None, + use_cache: bool | None, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFRobertaPreLayerNormMainLayer(keras.layers.Layer): + config_class = RobertaPreLayerNormConfig + + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.encoder = TFRobertaPreLayerNormEncoder(config, name="encoder") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.pooler = TFRobertaPreLayerNormPooler(config, name="pooler") if add_pooling_layer else None + # The embeddings must be the last declaration in order to follow the weights order + self.embeddings = TFRobertaPreLayerNormEmbeddings(config, name="embeddings") + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values[0] is not None: + # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length] + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.LayerNorm(inputs=sequence_output) + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RobertaPreLayerNormConfig + base_model_prefix = "roberta_prelayernorm" + + +ROBERTA_PRELAYERNORM_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.roberta_prelayernorm( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm +class TFRobertaPreLayerNormLMHead(keras.layers.Layer): + """RobertaPreLayerNorm Head for masked language modeling.""" + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.act = get_tf_activation("gelu") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + + def get_output_embeddings(self): + return self.decoder + + def set_output_embeddings(self, value): + self.decoder.weight = value + self.decoder.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + # project back to size of vocabulary with bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING +) +class TFRobertaPreLayerNormForMaskedLM(TFRobertaPreLayerNormPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( + config, add_pooling_layer=False, name="roberta_prelayernorm" + ) + self.lm_head = TFRobertaPreLayerNormLMHead(config, self.roberta_prelayernorm.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.69, + ) + # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM.call with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMaskedLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config: RobertaPreLayerNormConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if not config.is_decoder: + logger.warning( + "If you want to use `TFRobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`" + ) + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( + config, add_pooling_layer=False, name="roberta_prelayernorm" + ) + self.lm_head = TFRobertaPreLayerNormLMHead( + config, input_embeddings=self.roberta_prelayernorm.embeddings, name="lm_head" + ) + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = tf.ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + outputs = self.roberta_prelayernorm( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + logits = self.lm_head(hidden_states=sequence_output, training=training) + loss = None + + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm +class TFRobertaPreLayerNormClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, features, training=False): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top + of the pooled output) e.g. for GLUE tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +class TFRobertaPreLayerNormForSequenceClassification( + TFRobertaPreLayerNormPreTrainedModel, TFSequenceClassificationLoss +): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( + config, add_pooling_layer=False, name="roberta_prelayernorm" + ) + self.classifier = TFRobertaPreLayerNormClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification.call with roberta->roberta_prelayernorm + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled + output and a softmax) e.g. for RocStories/SWAG tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm +class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + outputs = self.roberta_prelayernorm( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + RoBERTa-PreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( + config, add_pooling_layer=False, name="roberta_prelayernorm" + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification.call with roberta->roberta_prelayernorm + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + RoBERTa-PreLayerNorm Model with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ROBERTA_PRELAYERNORM_START_DOCSTRING, +) +class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( + config, add_pooling_layer=False, name="roberta_prelayernorm" + ) + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering.call with roberta->roberta_prelayernorm + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + outputs = self.roberta_prelayernorm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFRobertaPreLayerNormForCausalLM", + "TFRobertaPreLayerNormForMaskedLM", + "TFRobertaPreLayerNormForMultipleChoice", + "TFRobertaPreLayerNormForQuestionAnswering", + "TFRobertaPreLayerNormForSequenceClassification", + "TFRobertaPreLayerNormForTokenClassification", + "TFRobertaPreLayerNormMainLayer", + "TFRobertaPreLayerNormModel", + "TFRobertaPreLayerNormPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec918f62dae19e228e310eb0c30c13d37b0bae8f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13c9528a4fc83e4537767cb09aa58791fa90b785 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b0ea6d138a906b841323c0b0cad5688ec9e3af2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed7f4e0418ab53b3f63a9dc7ad3d41f8638c0ce6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a206f3305dc7d8543ba9a3dfc5cca37f9d83f6e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..512ed1490b0a6d6e19d3673b8eeaf57f246519be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edca02532183941fa4adb0576b42c4656a2673cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47198204739cda38994e61436895ef5345a42df9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f6cb0378d5259d03d79fcdc9d01717bf8805c14 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a15abb306adc8fc36dcf1c05f67be38befc3582 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85a81f7f9701e4cab8dab19c89c561a43cfbff46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b52e5272793148974e55be67c0bf30dfb25bbf0c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d667f0404799ad7ccc0e91cb89118ae54fca31eb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3da07d430abaeb67fb0120cfe5d706a14ba3027e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77a9052b4992de31360e0ca53e6fb2e3e55e4cfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96cc331a80be6df9da4531955bed1d3851241515 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4c813690e4f1869248ffd42a892cb160ef4fb35 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40648dfa6de37cfcb8c63162f1de416961eba882 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9724b871ead71c72dd27b4287c8b8f1b62e11783 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22f337bbee2b7e6a800a7248fc7ae961c07e0f07 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee74d6efdbe268b29fbbb0f08a77896c1377e965 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f89565586d03af4b843cadfa37625bd3ac463194 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d61e62083ba50c96f0567e9e151726748ce8d40 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcce5899717c5a6a442e2d1b28f1cf32c76ce53d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bc50d8e6d0b65cfc38a2b0a2e7f21d0fd050df8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f81f25bfed698ccfc4246e3e0c6d77e116e086f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bbb950cb6e690a00c3072db255a0da06acf8786 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54a88cab43b8147db5e5f46f9956b2dd51d68568 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daf49be994dd34851c94e82b24e69c025aac8f2e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0979af46c651400ad9ec7b35896025ecd7359412 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..878deb9f4dd7762d5721717042786aa98e792826 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2882fb051a3115098c29c0c63079f12c9aec05d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04c9924af19248a034d70cc63e273bc4d9e99106 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcb99845eab3b56d6b8ec184f20ba6e938eaab15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a38729613236f71f6ab3e3c2da270207e86b6df Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4457f0f1cc58845691242df2ee3c690499eee237 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b41c180e350e5b75b7451ccd5a31e9bed0143684 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abc5632cf4f8f5af8521ed8f4426f332365329b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adc8c01a2549a5f52e003dfe7301af79689847ed Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4370d21e3b557b2c1adb3381acfa8f674ca7768 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23422212827afa9aafe28bc86e3fd71d547d8733 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e8c6b7257bc0ba9344a98129df3e0326f964dd5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9f8174e27f472b4fbc819c2db83efbde9ce769d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6f021c148b3e830dea8f37a11262718a7e68d13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d2a187fc43991fe0a73e99943f2ec37e25f7deb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8afbd8395d1036a70eae0e5837acb623b63e60d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a11a3d040bb4624a4bc64699e01818139f690237 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31f4cc783dc73fc9121ee3654331bc20e441d311 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ffc8024094b2dc450ac4fe362cccec99d4aadf7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da642d9fc1ff306cfc3343a1ab25a0c4f4a39a87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95ec9a07f92609c80a0767911fa437dac84fa71d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94a0c13d20db3b34225ea0d1cebde871daa99c2a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bbec9bc7adb265f0af09419d566aa7f90c159a4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93127d6a4acc8a6545c2df3d21e38279bf6331ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89ce09c8e1923fda3e4a77f13186e4d0fc08da83 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc55f20aebce3262a9420ff8476a98a7cc811b1d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f48b50a5184066b97ca86799884541521e7ba7cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab15016cd5ea21bc02ee149a65e17a089e03d0d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6527e2053648bb5b96156c51eec2da8fb68a78f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7e23f71b6296bf296cb1a9bdbc5b93066803546 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ee8774ab5db8d3352efcfaa81159ec1247fc9b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1e1bf5383ad515d80eb0e476017159101b33f87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74ce4f8480e8af2037cb7df1589566e281c20514 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fc4762a062a86bc5c6dbd296171ab495fd1a104 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24d4268d39742ffcd826ee0411346c5d64f697ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b3145d7a24e193e0d5ad84e4eb79b59a36b9611 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2092b57cc781fb8332f3dafd139860e3adf5fbba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b5ee37ad9e2e44b665c3f7f646d03665b9ff87d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b998802c4c60314fe1e7da92a6023ec8c797ef68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bc301f6c8713609500022a624f49c2a70174a92 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37f12f8bb1dfcd09c8b1e28fb5f93146ebd9ac48 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e493c4ccc472251b942d744ec761a510645a573a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6be9ab909f7312344be6b9eb1e1f36211de6ae20 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6035c533cc5d036c99b11c35ed68699c44d7df1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db1a9e12457d640f0f0fd4a23934dd9dfedf30b7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc2a542642d500de73f6d68704d85e56f91ec146 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53b5e40aae9b57767d954f6f1be75c907efc945e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a03daae25f4535c96c32e20dc4b13a817518bc06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_unispeech import * + from .modeling_unispeech import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..a71ba7f1b7d2a57671e8e75f1c773b0665e1614a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""UniSpeech model configuration""" + +import functools +import operator + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class UniSpeechConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UniSpeechModel`]. It is used to instantiate an + UniSpeech model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the UniSpeech + [microsoft/unispeech-large-1500h-cv](https://huggingface.co/microsoft/unispeech-large-1500h-cv) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`UniSpeechModel`]. Vocabulary size of the model. Defines the + different tokens that can be represented by the *inputs_ids* passed to the forward method of + [`UniSpeechModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`UniSpeechForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more + details. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_extract_activation (`str, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + do_stable_layer_norm (`bool`, *optional*, defaults to `False`): + Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://huggingface.co/papers/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2): + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`UniSpeechForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`UniSpeechForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`UniSpeechForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + num_ctc_classes (`int`, *optional*, defaults to 80): + Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant + when using an instance of [`UniSpeechForPreTraining`]. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + replace_prob (`float`, *optional*, defaults to 0.5): + Probability that transformer feature is replaced by quantized feature for pretraining. + + Example: + + ```python + >>> from transformers import UniSpeechConfig, UniSpeechModel + + >>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration + >>> configuration = UniSpeechConfig() + + >>> # Initializing a model (with random weights) from the facebook/unispeech-base-960h style configuration + >>> model = UniSpeechModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "unispeech" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + do_stable_layer_norm=False, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="mean", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + num_ctc_classes=80, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + replace_prob=0.5, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.num_ctc_classes = num_ctc_classes + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.use_weighted_layer_sum = use_weighted_layer_sum + self.classifier_proj_size = classifier_proj_size + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # pretraining loss + self.replace_prob = replace_prob + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +__all__ = ["UniSpeechConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..233653a36b39c58d3edcc984cefec9f8e44961be --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py @@ -0,0 +1,1518 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/unispeech/modular_unispeech.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_unispeech.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + ModelOutput, + SequenceClassifierOutput, + Wav2Vec2BaseModelOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, is_torch_flex_attn_available, logging +from .configuration_unispeech import UniSpeechConfig + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions. + """ +) +class UniSpeechForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://huggingface.co/papers/2006.11477). + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + codevector_perplexity (`torch.FloatTensor` of shape `(1,)`): + The perplexity of the codevector distribution, used to measure the diversity of the codebook. + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: Optional[torch.FloatTensor] = None + projected_quantized_states: Optional[torch.FloatTensor] = None + codevector_perplexity: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +class UniSpeechSamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class UniSpeechPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = UniSpeechSamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class UniSpeechNoLayerNormConvLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class UniSpeechLayerNormConvLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class UniSpeechGroupNormConvLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class UniSpeechFeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [UniSpeechGroupNormConvLayer(config, layer_id=0)] + [ + UniSpeechNoLayerNormConvLayer(config, layer_id=i + 1) + for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + UniSpeechLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class UniSpeechFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class UniSpeechAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[UniSpeechConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + # TODO: we need a refactor so that the different attention modules can get their specific kwargs + # ATM, we have mixed things encoder, decoder, and encoder-decoder attn + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = key_value_states.shape[1] if is_cross_attention else tgt_len + + q_input_shape = (bsz, tgt_len, -1, self.head_dim) + kv_input_shape = (bsz, src_len, -1, self.head_dim) + + # get query proj + query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2) + + current_states = key_value_states if is_cross_attention else hidden_states + key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2) + value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + output_attentions=output_attentions, + head_mask=layer_head_mask, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights, None + + +class UniSpeechFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class UniSpeechEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.attention = UniSpeechAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + config=config, + ) + + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = UniSpeechFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class UniSpeechEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([UniSpeechEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + attention_mask = self._update_full_mask( + attention_mask, + hidden_states, + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + +class UniSpeechAttnAdapterLayer(nn.Module): + def __init__(self, config): + """ + Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed + up training throughput. + """ + super().__init__() + self.input_dim = config.adapter_attn_dim + self.hidden_dim = config.hidden_size + + self.norm = nn.LayerNorm(self.hidden_dim) + self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim) + self.act_fn = nn.ReLU() + self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) + + def forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.norm(hidden_states) + + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +class UniSpeechEncoderLayerStableLayerNorm(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.attention = UniSpeechAttention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + config=config, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = UniSpeechFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if getattr(config, "adapter_attn_dim", None) is not None: + self.adapter_layer = UniSpeechAttnAdapterLayer(config) + else: + self.adapter_layer = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + if self.adapter_layer is not None: + hidden_states = hidden_states + self.adapter_layer(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class UniSpeechEncoderStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList( + [UniSpeechEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + attention_mask = self._update_full_mask( + attention_mask, + hidden_states, + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + +class UniSpeechGumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information. + """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = nn.Parameter( + torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs): + marginal_probs = probs.mean(dim=0) + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +@auto_docstring +class UniSpeechPreTrainedModel(PreTrainedModel): + config: UniSpeechConfig + base_model_prefix = "unispeech" + main_input_name = "input_values" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + def _init_weights(self, module): + """Initialize the weights""" + # gumbel softmax requires special init + if isinstance(module, UniSpeechGumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, UniSpeechPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, UniSpeechFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + return input_lengths + + def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. + non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long) + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + +def _compute_mask_indices( + shape: tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.detach().sum(-1).tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput + + +@auto_docstring +class UniSpeechModel(UniSpeechPreTrainedModel): + def __init__(self, config: UniSpeechConfig): + super().__init__(config) + self.config = config + self.feature_extractor = UniSpeechFeatureEncoder(config) + self.feature_projection = UniSpeechFeatureProjection(config) + + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_()) + + if config.do_stable_layer_norm: + self.encoder = UniSpeechEncoderStableLayerNorm(config) + else: + self.encoder = UniSpeechEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://huggingface.co/papers/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, UniSpeechBaseModelOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return UniSpeechBaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + UniSpeech Model with a vector-quantization module and ctc loss for pre-training. + """ +) +class UniSpeechForPreTraining(UniSpeechPreTrainedModel): + def __init__(self, config: UniSpeechConfig): + super().__init__(config) + self.unispeech = UniSpeechModel(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = UniSpeechGumbelVectorQuantizer(config) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size) + + self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes) + self.dropout = nn.Dropout(config.final_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.unispeech.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. + """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1) + logits = logits.type_as(target_features) + + # apply temperature + logits = logits / temperature + return logits + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, UniSpeechForPreTrainingOutput]: + r""" + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv") + >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv") + >>> # TODO: Add full pretraining example + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.unispeech( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + transformer_features = outputs[0] + + # quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + quantized_features, codevector_perplexity = self.quantizer(extract_features) + + # project quantized features twice + quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype)) + quantized_features = self.project_hid(quantized_features) + + prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_( + self.config.replace_prob + ) + prob_replace_matrix = prob_replace_matrix.transpose(0, 1) + sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device) + sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1) + sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1) + logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + ( + quantized_features.masked_fill(~sampled_replace_matrix, 0.0) + ) + + # project to ctc units + logits = self.dropout(logits) + logits = self.ctc_proj(logits) + + # TODO(PVP) - add negative sampling & loss computation + loss = None + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return UniSpeechForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +_HIDDEN_STATES_START_POSITION = 2 + + +@auto_docstring( + custom_intro=""" + UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + """ +) +class UniSpeechForCTC(UniSpeechPreTrainedModel): + def __init__(self, config, target_lang: Optional[str] = None): + r""" + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter..safetensors or + adapter..bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by + default. + """ + super().__init__(config) + + self.unispeech = UniSpeechModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + self.target_lang = target_lang + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def tie_weights(self): + """ + This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when + passing `target_lang=...` to `from_pretrained(...)`. + + This method is **not** supposed to be called by the user and is prone to be changed in the future. + """ + + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to + # correctly load adapter layers for UniSpeech so that we do not have to introduce a new API to + # [`PreTrainedModel`]. While slightly hacky, UniSpeech never has to tie input and output embeddings, so that it is + # ok to repurpose this function here. + target_lang = self.target_lang + + if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: + raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") + elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: + logger.info("By default `target_lang` is set to 'eng'.") + elif target_lang is not None: + self.load_adapter(target_lang, force_load=True) + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.unispeech.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.unispeech.parameters(): + param.requires_grad = False + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + outputs = self.unispeech( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring( + custom_intro=""" + UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """ +) +class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)" + ) + self.unispeech = UniSpeechModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.unispeech.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.unispeech.parameters(): + param.requires_grad = False + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.unispeech( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "UniSpeechForCTC", + "UniSpeechForPreTraining", + "UniSpeechForSequenceClassification", + "UniSpeechModel", + "UniSpeechPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..900079b7bb9b469f562bab3be8ea3605f03bd0f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py @@ -0,0 +1,445 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch UniSpeech model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...modeling_outputs import ModelOutput, Wav2Vec2BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ..wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2Encoder, + Wav2Vec2EncoderStableLayerNorm, + Wav2Vec2FeatureEncoder, + Wav2Vec2FeatureProjection, + Wav2Vec2ForCTC, + Wav2Vec2ForSequenceClassification, + Wav2Vec2GumbelVectorQuantizer, + Wav2Vec2Model, + Wav2Vec2PositionalConvEmbedding, +) +from .configuration_unispeech import UniSpeechConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions. + """ +) +class UniSpeechForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://huggingface.co/papers/2006.11477). + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + codevector_perplexity (`torch.FloatTensor` of shape `(1,)`): + The perplexity of the codevector distribution, used to measure the diversity of the codebook. + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: Optional[torch.FloatTensor] = None + projected_quantized_states: Optional[torch.FloatTensor] = None + codevector_perplexity: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +class UniSpeechPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding): + pass + + +class UniSpeechFeatureEncoder(Wav2Vec2FeatureEncoder): + pass + + +class UniSpeechFeatureProjection(Wav2Vec2FeatureProjection): + pass + + +class UniSpeechEncoder(Wav2Vec2Encoder): + pass + + +class UniSpeechEncoderStableLayerNorm(Wav2Vec2EncoderStableLayerNorm): + pass + + +class UniSpeechGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer): + @staticmethod + def _compute_perplexity(probs): + marginal_probs = probs.mean(dim=0) + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +@auto_docstring +class UniSpeechPreTrainedModel(PreTrainedModel): + config: UniSpeechConfig + base_model_prefix = "unispeech" + main_input_name = "input_values" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + def _init_weights(self, module): + """Initialize the weights""" + # gumbel softmax requires special init + if isinstance(module, UniSpeechGumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, UniSpeechPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, UniSpeechFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + return input_lengths + + def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. + non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long) + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + +UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput + + +class UniSpeechModel(UniSpeechPreTrainedModel, Wav2Vec2Model): + def __init__(self, config: UniSpeechConfig): + UniSpeechPreTrainedModel.__init__(self, config) + self.config = config + self.feature_extractor = UniSpeechFeatureEncoder(config) + self.feature_projection = UniSpeechFeatureProjection(config) + + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_()) + + if config.do_stable_layer_norm: + self.encoder = UniSpeechEncoderStableLayerNorm(config) + else: + self.encoder = UniSpeechEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for UniSpeech") + + def freeze_feature_encoder(self): + raise AttributeError("Not needed for UniSpeech") + + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, UniSpeechBaseModelOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return UniSpeechBaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + UniSpeech Model with a vector-quantization module and ctc loss for pre-training. + """ +) +class UniSpeechForPreTraining(UniSpeechPreTrainedModel): + def __init__(self, config: UniSpeechConfig): + super().__init__(config) + self.unispeech = UniSpeechModel(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = UniSpeechGumbelVectorQuantizer(config) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size) + + self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes) + self.dropout = nn.Dropout(config.final_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.unispeech.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. + """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1) + logits = logits.type_as(target_features) + + # apply temperature + logits = logits / temperature + return logits + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, UniSpeechForPreTrainingOutput]: + r""" + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv") + >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv") + >>> # TODO: Add full pretraining example + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.unispeech( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + transformer_features = outputs[0] + + # quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + quantized_features, codevector_perplexity = self.quantizer(extract_features) + + # project quantized features twice + quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype)) + quantized_features = self.project_hid(quantized_features) + + prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_( + self.config.replace_prob + ) + prob_replace_matrix = prob_replace_matrix.transpose(0, 1) + sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device) + sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1) + sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1) + logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + ( + quantized_features.masked_fill(~sampled_replace_matrix, 0.0) + ) + + # project to ctc units + logits = self.dropout(logits) + logits = self.ctc_proj(logits) + + # TODO(PVP) - add negative sampling & loss computation + loss = None + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return UniSpeechForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class UniSpeechForCTC(Wav2Vec2ForCTC): + pass + + +class UniSpeechForSequenceClassification(Wav2Vec2ForSequenceClassification): + pass + + +__all__ = [ + "UniSpeechForCTC", + "UniSpeechForPreTraining", + "UniSpeechForSequenceClassification", + "UniSpeechModel", + "UniSpeechPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..614f80d221131a42e162d79490b5b1efa46680d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c79e7e2c348a2a3405630905cc6ba0fb9334e4c3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bcfb3da23fc7b5986870a0ff0581adfec5bf0ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2d5fa35fac6543e1ba8099dbcb93cf92aa96e09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..418e017d34f3fd75603f6294e26703ad239f50af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ced8cc30ac718592a38863c5c78a1f487d6dc713 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b1fd42c94ee1c8f7a7967eae2d5acaa584c29ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c79f48b6009613c9a8d733cbe8424901dc58d80 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..613aae114b339649af52e6000dfa52ee18438f5f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vision_encoder_decoder import * + from .modeling_flax_vision_encoder_decoder import * + from .modeling_tf_vision_encoder_decoder import * + from .modeling_vision_encoder_decoder import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..248bf73ff9faf4ec54388c7fd947a1e3a3333b13 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging +from ..auto.configuration_auto import AutoConfig + + +if TYPE_CHECKING: + from ... import PreTrainedTokenizerBase, TensorType + +logger = logging.get_logger(__name__) + + +class VisionEncoderDecoderConfig(PretrainedConfig): + r""" + [`VisionEncoderDecoderConfig`] is the configuration class to store the configuration of a + [`VisionEncoderDecoderModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model according to the + specified arguments, defining the encoder and decoder configs. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + kwargs (*optional*): + Dictionary of keyword arguments. Notably: + + - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines + the encoder config. + - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines + the decoder config. + + Examples: + + ```python + >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel + + >>> # Initializing a ViT & BERT style configuration + >>> config_encoder = ViTConfig() + >>> config_decoder = BertConfig() + + >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) + + >>> # Initializing a ViTBert model (with random weights) from a ViT & google-bert/bert-base-uncased style configurations + >>> model = VisionEncoderDecoderModel(config=config) + + >>> # Accessing the model configuration + >>> config_encoder = model.config.encoder + >>> config_decoder = model.config.decoder + >>> # set decoder config to causal lm + >>> config_decoder.is_decoder = True + >>> config_decoder.add_cross_attention = True + + >>> # Saving the model, including its configuration + >>> model.save_pretrained("my-model") + + >>> # loading model and config from pretrained folder + >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model") + >>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config) + ```""" + + model_type = "vision-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} + has_no_defaults_at_init = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if "encoder" not in kwargs or "decoder" not in kwargs: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because " + f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}" + ) + + encoder_config = kwargs.pop("encoder") + encoder_model_type = encoder_config.pop("model_type") + decoder_config = kwargs.pop("decoder") + decoder_model_type = decoder_config.pop("model_type") + + self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) + self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) + self.is_encoder_decoder = True + + @classmethod + def from_encoder_decoder_configs( + cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs + ) -> PretrainedConfig: + r""" + Instantiate a [`VisionEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model + configuration and decoder model configuration. + + Returns: + [`VisionEncoderDecoderConfig`]: An instance of a configuration object + """ + logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config") + decoder_config.is_decoder = True + decoder_config.add_cross_attention = True + + return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs) + + +class VisionEncoderDecoderEncoderOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict({"last_hidden_state": {0: "batch", 1: "encoder_sequence"}}) + + +class VisionEncoderDecoderDecoderOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict() + common_inputs["input_ids"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + common_inputs["attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + common_inputs["encoder_hidden_states"] = {0: "batch", 1: "encoder_sequence"} + + return common_inputs + + def generate_dummy_inputs( + self, + tokenizer: "PreTrainedTokenizerBase", + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + import torch + + common_inputs = OrderedDict() + + dummy_input = super().generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + batch, encoder_sequence = dummy_input["input_ids"].shape + encoder_hidden_states_shape = (batch, encoder_sequence, self._config.encoder_hidden_size) + common_inputs["input_ids"] = dummy_input.pop("input_ids") + common_inputs["attention_mask"] = dummy_input.pop("attention_mask") + common_inputs["encoder_hidden_states"] = torch.zeros(encoder_hidden_states_shape) + + return common_inputs + + +class VisionEncoderDecoderOnnxConfig(OnnxConfig): + @property + def inputs(self) -> None: + pass + + def get_encoder_config(self, encoder_config: PretrainedConfig) -> OnnxConfig: + r""" + Returns ONNX encoder config for `VisionEncoderDecoder` model. + + Args: + encoder_config (`PretrainedConfig`): + The encoder model's configuration to use when exporting to ONNX. + + Returns: + [`VisionEncoderDecoderEncoderOnnxConfig`]: An instance of the ONNX configuration object + """ + return VisionEncoderDecoderEncoderOnnxConfig(encoder_config) + + def get_decoder_config( + self, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, feature: str = "default" + ) -> OnnxConfig: + r""" + Returns ONNX decoder config for `VisionEncoderDecoder` model. + + Args: + encoder_config (`PretrainedConfig`): + The encoder model's configuration to use when exporting to ONNX. + decoder_config (`PretrainedConfig`): + The decoder model's configuration to use when exporting to ONNX + feature (`str`, *optional*): + The type of feature to export the model with. + + Returns: + [`VisionEncoderDecoderDecoderOnnxConfig`]: An instance of the ONNX configuration object. + """ + decoder_config.encoder_hidden_size = encoder_config.hidden_size + return VisionEncoderDecoderDecoderOnnxConfig(decoder_config, feature) + + +__all__ = ["VisionEncoderDecoderConfig", "VisionEncoderDecoderOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a59c799cc04afa51c0f2156baa06a88683f1dbf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py @@ -0,0 +1,864 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes to support Vision-Encoder-Text-Decoder architectures""" + +import os +from typing import Optional, Union + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput +from ...modeling_flax_utils import FlaxPreTrainedModel +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ..auto.configuration_auto import AutoConfig +from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM +from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "VisionEncoderDecoderConfig" + +VISION_ENCODER_DECODER_START_DOCSTRING = r""" + This class can be used to initialize an image-to-text-sequence model with any pretrained vision autoencoding model + as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via + [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`] + function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream + generative task, like image captioning. + + The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation + tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation + Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi + Zhou, Wei Li, Peter J. Liu. + + Additionally, in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained + Models](https://huggingface.co/papers/2109.10282) it is shown how leveraging large pretrained vision models for optical + character recognition (OCR) yields a significant performance improvement. + + After such a Vision-Encoder-Text-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any + other models (see the examples for more information). + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Parameters: + config ([`VisionEncoderDecoderConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using + [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details. + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + decoder_position_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.decoder.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple. +""" + +VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using + [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple. +""" + +VISION_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + For sequence to sequence training, `decoder_input_ids` should be provided. If no `decoder_input_ids` is + provided, the model will create this tensor by shifting the `input_ids` to the right for denoising + pre-training. + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + decoder_position_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.decoder.max_position_embeddings - 1]`. + past_key_values (`dict[str, jnp.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a + plain tuple. +""" + + +class FlaxVisionEncoderDecoderModule(nn.Module): + config: VisionEncoderDecoderConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + encoder_config = self.config.encoder + decoder_config = self.config.decoder + + # Copied from `modeling_hybrid_clip.py` with modifications. + from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING + + encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class + decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class + + self.encoder = encoder_module(encoder_config, dtype=self.dtype) + self.decoder = decoder_module(decoder_config, dtype=self.dtype) + + # encoder outputs might need to be projected to different dimension for decoder + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + self.enc_to_dec_proj = nn.Dense( + self.decoder.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range), + dtype=self.dtype, + ) + else: + self.enc_to_dec_proj = None + + def _get_encoder_module(self): + return self.encoder + + def _get_projection_module(self): + return self.enc_to_dec_proj + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + pixel_values, + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + encoder_hidden_states = encoder_outputs[0] + + # optionally project encoder_hidden_states + if self.enc_to_dec_proj is not None: + encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) + + # The advantage of explicitly setting this is TPU XLA compiler knows as soon as possible what shape this + # variable has and can better optimize. Also passing `None` can lead to some problems when jitting the model. + # In Flax/JAX, we only want to pass `None` for non-tensor function inputs. For all tensor function inputs, we + # should always pass a tensor and not `None`. + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqLMOutput( + logits=decoder_outputs.logits, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING) +class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel): + r""" + [`FlaxVisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture + with the module (flax.nn.Module) of one of the base vision model classes of the library as encoder module and + another one as decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method + for the encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder. + """ + + config_class = VisionEncoderDecoderConfig + base_model_prefix = "vision_encoder_decoder" + main_input_name = "pixel_values" + module_class = FlaxVisionEncoderDecoderModule + + def __init__( + self, + config: VisionEncoderDecoderConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if not _do_init: + raise ValueError( + "`FlaxVisionEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`." + ) + + if input_shape is None: + num_channels = getattr(config.encoder, "num_channels", 3) + input_shape = ( + (1, config.encoder.image_size, config.encoder.image_size, num_channels), + (1, 1), + ) + + if config.decoder.cross_attention_hidden_size is not None: + if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size: + raise ValueError( + "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal" + f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for" + f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for" + " `config.encoder.hidden_size`." + ) + + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + encoder_input_shape, decoder_input_shape = input_shape + + # init input tensors + pixel_values = jnp.zeros(encoder_input_shape, dtype=self.dtype) + decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + + batch_size, _, _, _ = pixel_values.shape + decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape + if not decoder_batch_size == batch_size: + raise ValueError( + f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder " + f"and {decoder_batch_size} for decoder." + ) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length) + ) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + pixel_values, + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC) + def encode( + self, + pixel_values: jnp.ndarray, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoImageProcessor, FlaxVisionEncoderDecoderModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "openai-community/gpt2" + ... ) + + >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values + >>> encoder_outputs = model.encode(pixel_values) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # `FlaxViTModel` expects channel first format, but `FlaxViTModule` expects channel last format. + # Currently, we assume this holds for all Flax vision models, and perform a transpose here. + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, pixel_values, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(pixel_values, **kwargs) + + outputs = self.module.apply( + {"params": params or self.params}, + pixel_values=jnp.array(pixel_values, dtype=self.dtype), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + if return_dict: + outputs = FlaxBaseModelOutput( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + return outputs + + @add_start_docstrings(VISION_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def decode( + self, + decoder_input_ids, + encoder_outputs, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: Optional[dict] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoImageProcessor, FlaxVisionEncoderDecoderModel + >>> import jax.numpy as jnp + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "openai-community/gpt2" + ... ) + + >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values + >>> encoder_outputs = model.encode(pixel_values) + + >>> decoder_start_token_id = model.config.decoder.bos_token_id + >>> decoder_input_ids = jnp.ones((pixel_values.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward( + module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs + ): + projection_module = module._get_projection_module() + decoder_module = module._get_decoder_module() + + # optionally project encoder_hidden_states + if projection_module is not None: + encoder_hidden_states = projection_module(encoder_hidden_states) + + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + encoder_hidden_states, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + @add_start_docstrings_to_model_forward(VISION_ENCODER_DECODER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def __call__( + self, + pixel_values: jnp.ndarray, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import FlaxVisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> # load output tokenizer + >>> tokenizer_output = AutoTokenizer.from_pretrained("openai-community/gpt2") + + >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "openai-community/gpt2" + ... ) + + >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values + + >>> # use GPT2's eos_token as the pad as well as eos token + >>> model.config.eos_token_id = model.config.decoder.eos_token_id + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> # generation + >>> sequences = model.generate(pixel_values, num_beams=4, max_length=12).sequences + + >>> captions = tokenizer_output.batch_decode(sequences, skip_special_tokens=True) + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + + # `FlaxViTModel` expects channel first format, but `FlaxViTModule` expects channel last format. + # Currently, we assume this holds for all Flax vision models, and perform a transpose here. + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + + # prepare decoder inputs + if decoder_input_ids is None: + raise ValueError("`decoder_input_ids` can't be `None`.") + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + pixel_values=jnp.array(pixel_values, dtype=self.dtype), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + decoder_position_ids = jnp.broadcast_to( + jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length) + ) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": decoder_position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + @classmethod + def from_encoder_decoder_pretrained( + cls, + encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + *model_args, + **kwargs, + ) -> FlaxPreTrainedModel: + r""" + Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model + checkpoints. + + Params: + encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*): + Information necessary to initiate the encoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An + example is `google/vit-base-patch16-224-in21k`. + - A path to a *directory* containing model weights saved using + [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + + decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`): + Information necessary to initiate the decoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + + model_args (remaining positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). + + - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter. + - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter. + - To update the parent model configuration, do not use a prefix for each configuration parameter. + + Behaves differently depending on whether a `config` is provided or automatically loaded. + + Example: + + ```python + >>> from transformers import FlaxVisionEncoderDecoderModel + + >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "openai-community/gpt2" + ... ) + >>> # saving model after fine-tuning + >>> model.save_pretrained("./vit-gpt2") + >>> # load fine-tuned model + >>> model = FlaxVisionEncoderDecoderModel.from_pretrained("./vit-gpt2") + ```""" + + kwargs_encoder = { + argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") + } + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # remove encoder, decoder kwargs from kwargs + for key in kwargs_encoder: + del kwargs["encoder_" + key] + for key in kwargs_decoder: + del kwargs["decoder_" + key] + + # Load and initialize the encoder and decoder + # The distinction between encoder and decoder at the model level is made + # by the value of the flag `is_decoder` that we need to set correctly. + encoder = kwargs_encoder.pop("model", None) + if encoder is None: + if encoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_encoder: + encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path) + if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True: + logger.info( + f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model " + "from a decoder model. Cross-attention and causal mask are disabled." + ) + encoder_config.is_decoder = False + encoder_config.add_cross_attention = False + + kwargs_encoder["config"] = encoder_config + + encoder = FlaxAutoModel.from_pretrained( + encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder + ) + + decoder = kwargs_decoder.pop("model", None) + if decoder is None: + if decoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_decoder: + decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) + if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False: + logger.info( + f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention" + f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if" + f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." + ) + decoder_config.is_decoder = True + decoder_config.add_cross_attention = True + + kwargs_decoder["config"] = decoder_config + + if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False: + logger.warning( + f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. " + f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, " + "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` " + "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a " + "`decoder_config` to `.from_encoder_decoder_pretrained(...)`" + ) + + decoder = FlaxAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) + + # instantiate config with corresponding kwargs + dtype = kwargs.pop("dtype", jnp.float32) + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs) + + # init model + model = cls(config, dtype=dtype) + model.params["encoder"] = encoder.params + model.params["decoder"] = decoder.params + + return model + + +__all__ = ["FlaxVisionEncoderDecoderModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..ef2ea2109987beabf5f8dc7b758d055f0a543bbf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -0,0 +1,696 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes to support TF Vision-Encoder-Text-Decoder architectures""" + +from __future__ import annotations + +import re +import warnings + +import numpy as np +import tensorflow as tf + +from ...configuration_utils import PretrainedConfig +from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, keras, unpack_inputs +from ...tf_utils import shape_list +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..auto.configuration_auto import AutoConfig +from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM +from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "VisionEncoderDecoderConfig" + +DEPRECATION_WARNING = ( + "Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the" + " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if" + " fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the" + " labels, no need to pass them yourself anymore." +) + +VISION_ENCODER_DECODER_START_DOCSTRING = r""" + This class can be used to initialize an image-to-text-sequence model with any pretrained vision autoencoding model + as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via + [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`] + function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream + generative task, like image captioning. + + The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation + tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation + Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi + Zhou, Wei Li, Peter J. Liu. + + Additionally, in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained + Models](https://huggingface.co/papers/2109.10282) it is shown how leveraging large pretrained vision models for optical + character recognition (OCR) yields a significant performance improvement. + + After such a Vision-Encoder-Text-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any + other models (see the examples for more information). + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VisionEncoderDecoderConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using the vision's model's image processor. For example, using + [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details. + decoder_input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + Provide for sequence to sequence training to the decoder. Indices can be obtained using + [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for + details. + decoder_attention_mask (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + encoder_outputs (`tuple(tuple(tf.Tensor)`, *optional*): + This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` (`tf.Tensor` of shape `({0}, hidden_size)`) is a tensor of hidden-states at the output + of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(tf.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `({0})`. + decoder_inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + labels (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, + ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors: + + - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function. + - With a *decoder_* prefix which will be input as `**decoder_kwargs` for the decoder forward function. +""" + + +# Copied from transformers.models.encoder_decoder.modeling_tf_encoder_decoder.shift_tokens_right +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + if pad_token_id is None: + raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + + if decoder_start_token_id is None: + raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.") + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + + start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +@add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING) +class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): + r""" + [`TFVisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture + with one of the base vision model classes of the library as encoder and another one of the base model classes as + decoder when created with the [`~TFAutoModel.from_pretrained`] class method for the encoder and + [`~TFAutoModelForCausalLM.from_pretrained`] class method for the decoder. + """ + + config_class = VisionEncoderDecoderConfig + base_model_prefix = "vision_encoder_decoder" + load_weight_prefix = "tf_vision_encoder_decoder_model" + main_input_name = "pixel_values" + + def __init__( + self, + config: PretrainedConfig | None = None, + encoder: TFPreTrainedModel | None = None, + decoder: TFPreTrainedModel | None = None, + ): + if config is None and (encoder is None or decoder is None): + raise ValueError("Either a configuration or an encoder and a decoder has to be provided.") + if config is None: + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config) + else: + if not isinstance(config, self.config_class): + raise ValueError(f"config: {config} has to be of type {self.config_class}") + + if config.decoder.cross_attention_hidden_size is not None: + if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size: + raise ValueError( + "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal" + f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for" + f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for" + " `config.encoder.hidden_size`." + ) + + # initialize with config + super().__init__(config) + + if encoder is None: + encoder = TFAutoModel.from_config(config.encoder, name="encoder") + + if decoder is None: + decoder = TFAutoModelForCausalLM.from_config(config.decoder, name="decoder") + + self.encoder = encoder + self.decoder = decoder + + if self.encoder.config.to_dict() != self.config.encoder.to_dict(): + logger.warning( + f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:" + f" {self.config.encoder}" + ) + if self.decoder.config.to_dict() != self.config.decoder.to_dict(): + logger.warning( + f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:" + f" {self.config.decoder}" + ) + + # make sure that the individual model's config refers to the shared config + # so that the updates to the config will be synced + self.encoder.config = self.config.encoder + self.decoder.config = self.config.decoder + + # encoder outputs might need to be projected to different dimension for decoder + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + self.enc_to_dec_proj = keras.layers.Dense( + units=self.decoder.config.hidden_size, + kernel_initializer=get_initializer(config.encoder.initializer_range), + name="enc_to_dec_proj", + ) + + if self.encoder.get_output_embeddings() is not None: + raise ValueError( + f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head" + ) + + @property + def input_signature(self): + vision_config = self.config.encoder + if hasattr(vision_config, "vision_config"): + vision_config = vision_config.vision_config + if hasattr(vision_config, "image_size"): + image_size = vision_config.image_size + else: + image_size = vision_config.input_size + return { + "pixel_values": tf.TensorSpec( + shape=( + None, + vision_config.num_channels, + image_size, + image_size, + ), + dtype=tf.float32, + ), + "decoder_input_ids": tf.TensorSpec(shape=(None, None), dtype=tf.int32, name="decoder_input_ids"), + } + + def get_encoder(self): + return self.encoder + + def get_input_embeddings(self): + return self.encoder.get_input_embeddings() + + def get_output_embeddings(self): + return self.decoder.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + return self.decoder.set_output_embeddings(new_embeddings) + + def tf_to_pt_weight_rename(self, tf_weight): + # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models + # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal. + # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption + # here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's + # not the case, and I wasn't sure how else to go from the config to the correct MainLayer name! + + # This override is only needed in the case where we're crossloading weights from PT. However, since weights are + # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file. + # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it + # or not. + encoder_model_type = self.config.encoder.model_type + if "encoder" in tf_weight and "decoder" not in tf_weight: + return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),) + else: + return (tf_weight,) + + @classmethod + def from_encoder_decoder_pretrained( + cls, + encoder_pretrained_model_name_or_path: str | None = None, + decoder_pretrained_model_name_or_path: str | None = None, + *model_args, + **kwargs, + ) -> TFPreTrainedModel: + r""" + Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model + checkpoints. + + + Params: + encoder_pretrained_model_name_or_path (`str`, *optional*): + Information necessary to initiate the encoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An + example is `google/vit-base-patch16-224-in21k`. + - A path to a *directory* containing model weights saved using + [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case, + `encoder_from_pt` should be set to `True`. + + decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to *None*): + Information necessary to initiate the decoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case, + `decoder_from_pt` should be set to `True`. + + model_args (remaining positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). + + - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter. + - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter. + - To update the parent model configuration, do not use a prefix for each configuration parameter. + + Behaves differently depending on whether a `config` is provided or automatically loaded. + + Example: + + ```python + >>> from transformers import TFVisionEncoderDecoderModel + + >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized + >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased" + ... ) + >>> # saving model after fine-tuning + >>> model.save_pretrained("./vit-bert") + >>> # load fine-tuned model + >>> model = TFVisionEncoderDecoderModel.from_pretrained("./vit-bert") + ```""" + + kwargs_encoder = { + argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") + } + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # remove encoder, decoder kwargs from kwargs + for key in kwargs_encoder: + del kwargs["encoder_" + key] + for key in kwargs_decoder: + del kwargs["decoder_" + key] + + # Load and initialize the encoder and decoder + # The distinction between encoder and decoder at the model level is made + # by the value of the flag `is_decoder` that we need to set correctly. + encoder = kwargs_encoder.pop("model", None) + if encoder is None: + if encoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_encoder: + encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path) + if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True: + logger.info( + f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model " + "from a decoder model. Cross-attention and causal mask are disabled." + ) + encoder_config.is_decoder = False + encoder_config.add_cross_attention = False + + kwargs_encoder["config"] = encoder_config + + kwargs_encoder["name"] = "encoder" + kwargs_encoder["load_weight_prefix"] = cls.load_weight_prefix + encoder = TFAutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) + + decoder = kwargs_decoder.pop("model", None) + if decoder is None: + if decoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_decoder: + decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) + if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False: + logger.info( + f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention" + f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if" + f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." + ) + decoder_config.is_decoder = True + decoder_config.add_cross_attention = True + + kwargs_decoder["config"] = decoder_config + + if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False: + logger.warning( + f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. " + f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, " + "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` " + "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a " + "`decoder_config` to `.from_encoder_decoder_pretrained(...)`" + ) + + kwargs_decoder["name"] = "decoder" + kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix + decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) + + # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly. + if encoder.name != "encoder": + raise ValueError("encoder model must be created with the name `encoder`.") + if decoder.name != "decoder": + raise ValueError("decoder model must be created with the name `decoder`.") + + # instantiate config with corresponding kwargs + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs) + return cls(encoder=encoder, decoder=decoder, config=config) + + @unpack_inputs + @add_start_docstrings_to_model_forward( + VISION_ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: tuple | TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + labels: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + **kwargs, + ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoTokenizer, TFVisionEncoderDecoderModel + >>> from PIL import Image + >>> import requests + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + >>> decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + + >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized + >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "openai-community/gpt2" + ... ) + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> img = Image.open(requests.get(url, stream=True).raw) + + >>> # forward + >>> pixel_values = image_processor(images=img, return_tensors="tf").pixel_values # Batch size 1 + >>> decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids # Batch size 1 + >>> outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + + >>> # training + >>> outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) + >>> loss, logits = outputs.loss, outputs.logits + + >>> # save and load from pretrained + >>> model.save_pretrained("vit-gpt2") + >>> model = TFVisionEncoderDecoderModel.from_pretrained("vit-gpt2") + + >>> # generation + >>> generated = model.generate(pixel_values, decoder_start_token_id=model.config.decoder.bos_token_id) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # Let the user be responsible for the expected format. + if encoder_outputs is not None: + if return_dict and not isinstance(encoder_outputs, ModelOutput): + raise ValueError( + "If `return_dict=True` and `encoder_outputs` is provided, it should be an instance of " + f"`ModelOutput`. Got an instance {type(encoder_outputs)} for `encoder_outputs`." + ) + + if encoder_outputs is None: + encoder_inputs = { + "input_ids": pixel_values, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "training": training, + } + + # Add arguments to encoder from `kwargs_encoder` + encoder_inputs.update(kwargs_encoder) + + if "input_ids" in encoder_inputs: + encoder_inputs["pixel_values"] = encoder_inputs.pop("input_ids") + + if encoder_inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + # Handle the case where the inputs are passed as a single dict which contains `labels`. + # The `labels` shouldn't be passed to `self.encoder` below, because it is a based model without this + # parameter (otherwise, an error occurs when `input_processing` is called inside `self.encoder.call()`). + if "labels" in encoder_inputs: + labels = encoder_inputs.pop("labels") + + # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`. + if "decoder_input_ids" in encoder_inputs: + decoder_input_ids = encoder_inputs.pop("decoder_input_ids") + # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`. + if "decoder_attention_mask" in encoder_inputs: + decoder_attention_mask = encoder_inputs.pop("decoder_attention_mask") + + encoder_outputs = self.encoder(**encoder_inputs) + + encoder_hidden_states = encoder_outputs[0] + + # optionally project encoder_hidden_states + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) + + if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None): + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + batch_size, sequence_length = shape_list(encoder_hidden_states)[:2] + encoder_attention_mask = tf.ones(shape=(batch_size, sequence_length), dtype=tf.int32) + + decoder_inputs = { + "input_ids": decoder_input_ids, + "attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": encoder_attention_mask, + "inputs_embeds": decoder_inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "use_cache": use_cache, + "past_key_values": past_key_values, + "return_dict": return_dict, + "training": training, + } + + # Add arguments to decoder from `kwargs_decoder` + decoder_inputs.update(kwargs_decoder) + + decoder_outputs = self.decoder(**decoder_inputs) + + logits = decoder_outputs[0] + + # Compute loss independent from decoder (as some shift the logits inside them) + loss = None + if labels is not None: + warnings.warn(DEPRECATION_WARNING, FutureWarning) + loss = self.hf_compute_loss(labels, logits) + + if not return_dict: + past_key_values = None + if use_cache: + past_key_values = decoder_outputs[1] + # The starting index of the remaining elements in `decoder_outputs` + start_index = sum([1 if x is not None else 0 for x in (loss, logits, past_key_values)]) + + if not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + output = (loss, logits, past_key_values) + decoder_outputs[start_index:] + encoder_outputs + output = tuple(x for x in output if x is not None) + return output + + return TFSeq2SeqLMOutput( + loss=loss, + logits=decoder_outputs.logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.decoder.use_cache else None + dec_hs = ( + tf.convert_to_tensor(output.decoder_hidden_states) if self.config.decoder.output_hidden_states else None + ) + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.decoder.output_attentions else None + enc_hs = ( + tf.convert_to_tensor(output.encoder_hidden_states) if self.config.encoder.output_hidden_states else None + ) + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.encoder.output_attentions else None + cross_attns = ( + tf.convert_to_tensor(output.cross_attentions) + if self.config.decoder.output_attentions and output.cross_attentions is not None + else None + ) + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + cross_attentions=cross_attns, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) + decoder_attention_mask = decoder_inputs.get("attention_mask", None) + past_key_values = decoder_inputs.get("past_key_values") + input_dict = { + "pixel_values": None, # needs to be passed to make Keras.layer.__call__ happy + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_input_ids": decoder_inputs["input_ids"], + # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete + "encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]), + "past_key_values": past_key_values, + "use_cache": use_cache, + } + return input_dict + + def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + def resize_token_embeddings(self, *args, **kwargs): + raise NotImplementedError( + "Resizing the embedding layers via the TFVisionEncoderDecoderModel directly is not supported. " + "Please use the respective methods of the wrapped objects (model.decoder.resize_token_embeddings(...))" + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "enc_to_dec_proj", None) is not None: + with tf.name_scope(self.enc_to_dec_proj.name): + self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size]) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +__all__ = ["TFVisionEncoderDecoderModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..d6bc2dcc0f8e94e0c79c01e9f47ab2e34aedb3cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -0,0 +1,602 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes to support Vision-Encoder-Text-Decoder architectures""" + +import gc +import os +import tempfile +from typing import Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache +from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ..auto.configuration_auto import AutoConfig +from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM +from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig + + +# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + if decoder_start_token_id is None: + raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.") + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): + r""" + [`VisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with + one of the base vision model classes of the library as encoder and another one as decoder when created with the + :meth*~transformers.AutoModel.from_pretrained* class method for the encoder and + :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder. + """ + + config: VisionEncoderDecoderConfig + base_model_prefix = "vision_encoder_decoder" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_param_buffer_assignment = False + _supports_flash_attn = True + _supports_sdpa = True + + def __init__( + self, + config: Optional[PretrainedConfig] = None, + encoder: Optional[PreTrainedModel] = None, + decoder: Optional[PreTrainedModel] = None, + ): + r""" + encoder (`PreTrainedModel`, *optional*): + The encoder model to use. + decoder (`PreTrainedModel`, *optional*): + The decoder model to use. + """ + if config is None and (encoder is None or decoder is None): + raise ValueError("Either a configuration or an encoder and a decoder has to be provided.") + if config is None: + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config) + else: + if not isinstance(config, self.config_class): + raise ValueError(f"Config: {config} has to be of type {self.config_class}") + + if config.decoder.cross_attention_hidden_size is not None: + if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size: + raise ValueError( + "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal" + f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for" + f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for" + " `config.encoder.hidden_size`." + ) + + # initialize with config + # make sure input & output embeddings is not tied + config.tie_word_embeddings = False + super().__init__(config) + + if encoder is None: + encoder = AutoModel.from_config(config.encoder) + + if decoder is None: + decoder = AutoModelForCausalLM.from_config(config.decoder) + + self.encoder = encoder + self.decoder = decoder + self._can_compile_fullgraph = decoder._can_compile_fullgraph + + if self.encoder.config.to_dict() != self.config.encoder.to_dict(): + logger.warning( + f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:" + f" {self.config.encoder}" + ) + if self.decoder.config.to_dict() != self.config.decoder.to_dict(): + logger.warning( + f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:" + f" {self.config.decoder}" + ) + + # make sure that the individual model's config refers to the shared config + # so that the updates to the config will be synced + self.config.encoder._attn_implementation = self.encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation + self.encoder.config = self.config.encoder + self.decoder.config = self.config.decoder + + # encoder outputs might need to be projected to different dimension for decoder + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size) + + if self.encoder.get_output_embeddings() is not None: + raise ValueError( + f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head" + ) + + def get_encoder(self): + return self.encoder + + def get_input_embeddings(self): + return self.decoder.get_input_embeddings() + + def get_output_embeddings(self): + return self.decoder.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + return self.decoder.set_output_embeddings(new_embeddings) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" + Example: + + ```python + >>> from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer + >>> from PIL import Image + >>> import requests + + >>> image_processor = AutoImageProcessor.from_pretrained("ydshieh/vit-gpt2-coco-en") + >>> decoder_tokenizer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en") + >>> model = VisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> img = Image.open(requests.get(url, stream=True).raw) + >>> pixel_values = image_processor(images=img, return_tensors="pt").pixel_values # Batch size 1 + + >>> output_ids = model.generate( + ... pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True + ... ).sequences + + >>> preds = decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True) + >>> preds = [pred.strip() for pred in preds] + + >>> assert preds == ["a cat laying on top of a couch next to another cat"] + ```""" + + from_tf = kwargs.pop("from_tf", False) + if from_tf: + from transformers import TFVisionEncoderDecoderModel + + # a workaround to load from tensorflow checkpoint + # Using `_tf_model` won't work, because the weight names in the encoder/decoder of `_tf_model` get + # extended before saving those components. For example, The name of `_tf_model.encoder.vit` is + # `[top model name]/encoder/vit`, but the name of `tf_model.encoder.vit` is `[top model name]/vit`. The + # [top model name] is handled (stripped) by the conversion method, and the former case gets extra `encoder`, + # which should not occur when we want to save the components alone. + # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see + # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245 + # (the change in `src/transformers/modeling_tf_utils.py`) + _tf_model = TFVisionEncoderDecoderModel.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + config = _tf_model.config + + # Using `tf_model` instead + encoder = _tf_model.encoder.__class__(_tf_model.config.encoder) + decoder = _tf_model.decoder.__class__(_tf_model.config.decoder) + # Make sure models are built + encoder(encoder.dummy_inputs) + decoder(decoder.dummy_inputs) + + # Get the variable correspondence between `_tf_model` and `encoder` and `decoder` + encoder_variables = {} + for v in encoder.trainable_variables + encoder.non_trainable_variables: + encoder_variables["/".join(v.name.split("/")[1:])] = v + decoder_variables = {} + for v in decoder.trainable_variables + decoder.non_trainable_variables: + decoder_variables["/".join(v.name.split("/")[1:])] = v + + _encoder_variables = {} + for v in _tf_model.encoder.trainable_variables + _tf_model.encoder.non_trainable_variables: + _encoder_variables["/".join(v.name.split("/")[2:])] = v + _decoder_variables = {} + for v in _tf_model.decoder.trainable_variables + _tf_model.decoder.non_trainable_variables: + _decoder_variables["/".join(v.name.split("/")[2:])] = v + + # assign weight values to `encoder` and `decoder` from `_tf_model` + for name, v in encoder_variables.items(): + v.assign(_encoder_variables[name]) + for name, v in decoder_variables.items(): + v.assign(_decoder_variables[name]) + + tf_model = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder) + + # Deal with `enc_to_dec_proj` + if hasattr(_tf_model, "enc_to_dec_proj"): + tf_model(tf_model.dummy_inputs) + tf_model.enc_to_dec_proj.kernel.assign(_tf_model.enc_to_dec_proj.kernel) + tf_model.enc_to_dec_proj.bias.assign(_tf_model.enc_to_dec_proj.bias) + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder_dir = os.path.join(tmpdirname, "encoder") + decoder_dir = os.path.join(tmpdirname, "decoder") + tf_model.encoder.save_pretrained(encoder_dir) + tf_model.decoder.save_pretrained(decoder_dir) + + if hasattr(tf_model, "enc_to_dec_proj"): + enc_to_dec_proj_weight = torch.transpose( + torch.from_numpy(tf_model.enc_to_dec_proj.kernel.numpy()), 1, 0 + ) + enc_to_dec_proj_bias = torch.from_numpy(tf_model.enc_to_dec_proj.bias.numpy()) + + del _tf_model + del tf_model + gc.collect() + + attn_implementation = kwargs.get("attn_implementation") + kwargs_encoder_decoder = {} + if attn_implementation: + kwargs_encoder_decoder = { + "encoder_attn_implementation": attn_implementation, + "decoder_attn_implementation": attn_implementation, + } + + model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( + encoder_dir, + decoder_dir, + encoder_from_tf=True, + decoder_from_tf=True, + **kwargs_encoder_decoder, + ) + # This is only for copying some specific attributes of this particular model. + model.config = config + + if hasattr(model, "enc_to_dec_proj"): + model.enc_to_dec_proj.weight.data = enc_to_dec_proj_weight.contiguous() + model.enc_to_dec_proj.bias.data = enc_to_dec_proj_bias.contiguous() + + return model + + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + @classmethod + def from_encoder_decoder_pretrained( + cls, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, + *model_args, + **kwargs, + ) -> PreTrainedModel: + r""" + Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model + checkpoints. + + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train + the model, you need to first set it back in training mode with `model.train()`. + + Params: + encoder_pretrained_model_name_or_path (`str`, *optional*): + Information necessary to initiate the image encoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An + example is `google/vit-base-patch16-224-in21k`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`): + Information necessary to initiate the text decoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args (remaining positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). + + - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter. + - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter. + - To update the parent model configuration, do not use a prefix for each configuration parameter. + + Behaves differently depending on whether a `config` is provided or automatically loaded. + + Example: + + ```python + >>> from transformers import VisionEncoderDecoderModel + + >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized + >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased" + ... ) + >>> # saving model after fine-tuning + >>> model.save_pretrained("./vit-bert") + >>> # load fine-tuned model + >>> model = VisionEncoderDecoderModel.from_pretrained("./vit-bert") + ```""" + + kwargs_encoder = { + argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") + } + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # remove encoder, decoder kwargs from kwargs + for key in kwargs_encoder: + del kwargs["encoder_" + key] + for key in kwargs_decoder: + del kwargs["decoder_" + key] + + # Load and initialize the encoder and decoder + # The distinction between encoder and decoder at the model level is made + # by the value of the flag `is_decoder` that we need to set correctly. + encoder = kwargs_encoder.pop("model", None) + if encoder is None: + if encoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_encoder: + encoder_config, kwargs_encoder = AutoConfig.from_pretrained( + encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True + ) + + if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True: + logger.info( + f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model " + "from a decoder model. Cross-attention and causal mask are disabled." + ) + encoder_config.is_decoder = False + encoder_config.add_cross_attention = False + + kwargs_encoder["config"] = encoder_config + + encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) + + decoder = kwargs_decoder.pop("model", None) + if decoder is None: + if decoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_decoder: + decoder_config, kwargs_decoder = AutoConfig.from_pretrained( + decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True + ) + + if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False: + logger.info( + f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention" + f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if" + f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." + ) + decoder_config.is_decoder = True + decoder_config.add_cross_attention = True + + kwargs_decoder["config"] = decoder_config + + if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False: + logger.warning( + f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. " + f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, " + "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` " + "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a " + "`decoder_config` to `.from_encoder_decoder_pretrained(...)`" + ) + + decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) + + # instantiate config with corresponding kwargs + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs) + + # make sure input & output embeddings is not tied + config.tie_word_embeddings = False + return cls(encoder=encoder, decoder=decoder, config=config) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the + right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`. + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, + ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Examples: + + ```python + >>> from transformers import AutoProcessor, VisionEncoderDecoderModel + >>> import requests + >>> from PIL import Image + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("microsoft/trocr-base-handwritten") + >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + + >>> # load image from the IAM dataset + >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + >>> # training + >>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id + >>> model.config.pad_token_id = processor.tokenizer.pad_token_id + >>> model.config.vocab_size = model.config.decoder.vocab_size + + >>> pixel_values = processor(image, return_tensors="pt").pixel_values + >>> text = "hello world" + >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids + >>> outputs = model(pixel_values=pixel_values, labels=labels) + >>> loss = outputs.loss + + >>> # inference (generation) + >>> generated_ids = model.generate(pixel_values) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # num_items_in_batch is only needed for loss computation + num_items_in_batch = kwargs.pop("num_items_in_batch", None) + + kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + if encoder_outputs is None: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs_encoder, + ) + elif isinstance(encoder_outputs, tuple): + encoder_outputs = BaseModelOutput(*encoder_outputs) + + encoder_hidden_states = encoder_outputs[0] + + # optionally project encoder_hidden_states + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) + + # else: + encoder_attention_mask = None + + if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None): + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + past_key_values=past_key_values, + return_dict=return_dict, + cache_position=cache_position, + **kwargs_decoder, + ) + + # Compute loss independent from decoder (as some shift the logits inside them) + loss = None + if labels is not None: + logits = decoder_outputs.logits if return_dict else decoder_outputs[0] + + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.decoder.config.vocab_size, + num_items_in_batch=num_items_in_batch, + ) + + if not return_dict: + if loss is not None: + return (loss,) + decoder_outputs + encoder_outputs + else: + return decoder_outputs + encoder_outputs + + return Seq2SeqLMOutput( + loss=loss, + logits=decoder_outputs.logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + +__all__ = ["VisionEncoderDecoderModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5577c10f76f32c3339fe87ced37e1d4b84f8a260 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec826413bb66646f88ea3f41c29e25abf0ef7a51 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc21fcb8d77a3c4282062ccce6e9592e81f02e04 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7520263c51bcd3b63747ee0a55e1baff24a777f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_wav2vec2_bert import * + from .modeling_wav2vec2_bert import * + from .processing_wav2vec2_bert import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d32005e0fc61dbe0e8b0fa3c229ebb86282902e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..363305004a96998e5916a2748ba3a5d5be646463 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..2462976cfbbeb908fa3e8bbd04722f6aa066a7f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Wav2Vec2Bert model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Wav2Vec2BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to + instantiate an Wav2Vec2Bert model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert + [facebook/wav2vec2-bert-rel-pos-large](https://huggingface.co/facebook/wav2vec2-bert-rel-pos-large) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*): + Vocabulary size of the Wav2Vec2Bert model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`Wav2Vec2BertModel`]. Vocabulary size of the + model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward + method of [`Wav2Vec2BertModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + feature_projection_input_dim (`int`, *optional*, defaults to 160): + Input dimension of this model, i.e the dimension after processing input audios with [`SeamlessM4TFeatureExtractor`] or [`Wav2Vec2BertProcessor`]. + hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the feature projection. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Wav2Vec2BertForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more + details. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://huggingface.co/papers/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates `mask_time_prob*len(time_axis)/mask_time_length ``independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2): + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks`. + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Wav2Vec2BertForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Wav2Vec2BertForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Wav2Vec2BertForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 768): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + pad_token_id (`int`, *optional*, defaults to 0): The id of the _beginning-of-stream_ token. + bos_token_id (`int`, *optional*, defaults to 1): The id of the _padding_ token. + eos_token_id (`int`, *optional*, defaults to 2): The id of the _end-of-stream_ token. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very + useful for warm-starting Wav2Vec2Bert for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 1): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + adapter_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the adapter layers. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + use_intermediate_ffn_before_adapter (`bool`, *optional*, defaults to `False`): + Whether an intermediate feed-forward block should be stacked on top of the Wav2Vec2Bert Encoder and before the adapter network. + Only relevant if `add_adapter is True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`): + Can be specified to : + - `rotary`, for rotary position embeddings. + - `relative`, for relative position embeddings. + - `relative_key`, for relative position embeddings as defined by Shaw in [Self-Attention + with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + If left to `None`, no relative position embeddings is applied. + rotary_embedding_base (`int`, *optional*, defaults to 10000): + If `"rotary"` position embeddings are used, defines the size of the embedding base. + max_source_positions (`int`, *optional*, defaults to 5000): + if `"relative"` position embeddings are used, defines the maximum source input positions. + left_max_position_embeddings (`int`, *optional*, defaults to 64): + If `"relative_key"` (aka Shaw) position embeddings are used, defines the left clipping value for relative positions. + right_max_position_embeddings (`int`, *optional*, defaults to 8): + If `"relative_key"` (aka Shaw) position embeddings are used, defines the right clipping value for relative positions. + conv_depthwise_kernel_size (`int`, *optional*, defaults to 31): + Kernel size of convolutional depthwise 1D layer in Conformer blocks. + conformer_conv_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all convolutional layers in Conformer blocks. + Example: + + ```python + >>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel + + >>> # Initializing a Wav2Vec2Bert facebook/wav2vec2-bert-rel-pos-large style configuration + >>> configuration = Wav2Vec2BertConfig() + + >>> # Initializing a model (with random weights) from the facebook/wav2vec2-bert-rel-pos-large style configuration + >>> model = Wav2Vec2BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "wav2vec2-bert" + + def __init__( + self, + vocab_size=None, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + feature_projection_input_dim=160, + hidden_act="swish", + hidden_dropout=0.0, + activation_dropout=0.0, + attention_dropout=0.0, + feat_proj_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=768, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=1, + adapter_act="relu", + use_intermediate_ffn_before_adapter=False, + output_hidden_size=None, + position_embeddings_type="relative_key", + rotary_embedding_base=10000, + max_source_positions=5000, + left_max_position_embeddings=64, + right_max_position_embeddings=8, + conv_depthwise_kernel_size=31, + conformer_conv_dropout=0.1, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.feature_projection_input_dim = feature_projection_input_dim + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + self.max_source_positions = max_source_positions + + if position_embeddings_type is not None and position_embeddings_type not in [ + "rotary", + "relative", + "relative_key", + ]: + raise ValueError( + """ + `position_embeddings_type` is not valid. It must be one of the following values: + `["rotary", "relative", "relative_key"]` or left as `None`. + """ + ) + self.position_embeddings_type = position_embeddings_type + self.rotary_embedding_base = rotary_embedding_base + self.left_max_position_embeddings = left_max_position_embeddings + self.right_max_position_embeddings = right_max_position_embeddings + + # Conformer-block related + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.conformer_conv_dropout = conformer_conv_dropout + + # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.adapter_act = adapter_act + self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size + if use_intermediate_ffn_before_adapter and not add_adapter: + raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.") + self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + ratio = self.feature_projection_input_dim * 2 + if self.add_adapter: + ratio = ratio * (self.adapter_stride**self.num_adapter_layers) + return ratio + + +__all__ = ["Wav2Vec2BertConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..3448089c632bbc5515f0b1c75da424ed1cc23173 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -0,0 +1,1515 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_wav2vec2_bert.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +import warnings +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, is_peft_available +from .configuration_wav2vec2_bert import Wav2Vec2BertConfig + + +class Wav2Vec2BertRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864 + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.num_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + # Ignore copy + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +class Wav2Vec2BertRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` is the position of query vector and `j` is the + # position of key vector. We use positive relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i