koichi12 commited on
Commit
1e8ae5e
·
verified ·
1 Parent(s): 9aa22f7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_eventloop.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_fileio.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_streams.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_testing.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/anyio/abc/__init__.py +55 -0
  6. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/__init__.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_eventloop.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_resources.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_sockets.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_streams.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_subprocesses.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_tasks.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_testing.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/anyio/abc/_resources.py +33 -0
  15. .venv/lib/python3.11/site-packages/anyio/abc/_sockets.py +194 -0
  16. .venv/lib/python3.11/site-packages/anyio/abc/_streams.py +203 -0
  17. .venv/lib/python3.11/site-packages/anyio/abc/_tasks.py +101 -0
  18. .venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/INSTALLER +1 -0
  19. .venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/METADATA +55 -0
  20. .venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/RECORD +33 -0
  21. .venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/WHEEL +4 -0
  22. .venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/licenses/COPYING +19 -0
  23. .venv/lib/python3.11/site-packages/xformers/__init__.py +73 -0
  24. .venv/lib/python3.11/site-packages/xformers/_cpp_lib.py +155 -0
  25. .venv/lib/python3.11/site-packages/xformers/_deprecation_warning.py +12 -0
  26. .venv/lib/python3.11/site-packages/xformers/attn_bias_utils.py +501 -0
  27. .venv/lib/python3.11/site-packages/xformers/checkpoint.py +546 -0
  28. .venv/lib/python3.11/site-packages/xformers/cpp_lib.json +1 -0
  29. .venv/lib/python3.11/site-packages/xformers/factory/__init__.py +11 -0
  30. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/__init__.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/block_configs.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/block_factory.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/hydra_helper.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/model_factory.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/xformers/factory/__pycache__/weight_init.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/xformers/factory/block_configs.py +237 -0
  37. .venv/lib/python3.11/site-packages/xformers/factory/block_factory.py +358 -0
  38. .venv/lib/python3.11/site-packages/xformers/factory/hydra_helper.py +36 -0
  39. .venv/lib/python3.11/site-packages/xformers/factory/model_factory.py +313 -0
  40. .venv/lib/python3.11/site-packages/xformers/factory/weight_init.py +293 -0
  41. .venv/lib/python3.11/site-packages/xformers/info.py +77 -0
  42. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/common.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/differentiable_collectives.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/indexing.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/ipc.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/modpar_layers.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/rmsnorm.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/rope_padded.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/seqpar.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/sequence_parallel_fused_ops.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_eventloop.cpython-311.pyc ADDED
Binary file (6.92 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_fileio.cpython-311.pyc ADDED
Binary file (41.4 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_streams.cpython-311.pyc ADDED
Binary file (2.65 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/_core/__pycache__/_testing.cpython-311.pyc ADDED
Binary file (3.86 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from ._eventloop import AsyncBackend as AsyncBackend
4
+ from ._resources import AsyncResource as AsyncResource
5
+ from ._sockets import ConnectedUDPSocket as ConnectedUDPSocket
6
+ from ._sockets import ConnectedUNIXDatagramSocket as ConnectedUNIXDatagramSocket
7
+ from ._sockets import IPAddressType as IPAddressType
8
+ from ._sockets import IPSockAddrType as IPSockAddrType
9
+ from ._sockets import SocketAttribute as SocketAttribute
10
+ from ._sockets import SocketListener as SocketListener
11
+ from ._sockets import SocketStream as SocketStream
12
+ from ._sockets import UDPPacketType as UDPPacketType
13
+ from ._sockets import UDPSocket as UDPSocket
14
+ from ._sockets import UNIXDatagramPacketType as UNIXDatagramPacketType
15
+ from ._sockets import UNIXDatagramSocket as UNIXDatagramSocket
16
+ from ._sockets import UNIXSocketStream as UNIXSocketStream
17
+ from ._streams import AnyByteReceiveStream as AnyByteReceiveStream
18
+ from ._streams import AnyByteSendStream as AnyByteSendStream
19
+ from ._streams import AnyByteStream as AnyByteStream
20
+ from ._streams import AnyUnreliableByteReceiveStream as AnyUnreliableByteReceiveStream
21
+ from ._streams import AnyUnreliableByteSendStream as AnyUnreliableByteSendStream
22
+ from ._streams import AnyUnreliableByteStream as AnyUnreliableByteStream
23
+ from ._streams import ByteReceiveStream as ByteReceiveStream
24
+ from ._streams import ByteSendStream as ByteSendStream
25
+ from ._streams import ByteStream as ByteStream
26
+ from ._streams import Listener as Listener
27
+ from ._streams import ObjectReceiveStream as ObjectReceiveStream
28
+ from ._streams import ObjectSendStream as ObjectSendStream
29
+ from ._streams import ObjectStream as ObjectStream
30
+ from ._streams import UnreliableObjectReceiveStream as UnreliableObjectReceiveStream
31
+ from ._streams import UnreliableObjectSendStream as UnreliableObjectSendStream
32
+ from ._streams import UnreliableObjectStream as UnreliableObjectStream
33
+ from ._subprocesses import Process as Process
34
+ from ._tasks import TaskGroup as TaskGroup
35
+ from ._tasks import TaskStatus as TaskStatus
36
+ from ._testing import TestRunner as TestRunner
37
+
38
+ # Re-exported here, for backwards compatibility
39
+ # isort: off
40
+ from .._core._synchronization import (
41
+ CapacityLimiter as CapacityLimiter,
42
+ Condition as Condition,
43
+ Event as Event,
44
+ Lock as Lock,
45
+ Semaphore as Semaphore,
46
+ )
47
+ from .._core._tasks import CancelScope as CancelScope
48
+ from ..from_thread import BlockingPortal as BlockingPortal
49
+
50
+ # Re-export imports so they look like they live directly in this package
51
+ for __value in list(locals().values()):
52
+ if getattr(__value, "__module__", "").startswith("anyio.abc."):
53
+ __value.__module__ = __name__
54
+
55
+ del __value
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_eventloop.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_resources.cpython-311.pyc ADDED
Binary file (1.82 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_sockets.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_streams.cpython-311.pyc ADDED
Binary file (9.6 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_subprocesses.cpython-311.pyc ADDED
Binary file (3.66 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_tasks.cpython-311.pyc ADDED
Binary file (4.99 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/__pycache__/_testing.cpython-311.pyc ADDED
Binary file (3.02 kB). View file
 
.venv/lib/python3.11/site-packages/anyio/abc/_resources.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from abc import ABCMeta, abstractmethod
4
+ from types import TracebackType
5
+ from typing import TypeVar
6
+
7
+ T = TypeVar("T")
8
+
9
+
10
+ class AsyncResource(metaclass=ABCMeta):
11
+ """
12
+ Abstract base class for all closeable asynchronous resources.
13
+
14
+ Works as an asynchronous context manager which returns the instance itself on enter,
15
+ and calls :meth:`aclose` on exit.
16
+ """
17
+
18
+ __slots__ = ()
19
+
20
+ async def __aenter__(self: T) -> T:
21
+ return self
22
+
23
+ async def __aexit__(
24
+ self,
25
+ exc_type: type[BaseException] | None,
26
+ exc_val: BaseException | None,
27
+ exc_tb: TracebackType | None,
28
+ ) -> None:
29
+ await self.aclose()
30
+
31
+ @abstractmethod
32
+ async def aclose(self) -> None:
33
+ """Close the resource."""
.venv/lib/python3.11/site-packages/anyio/abc/_sockets.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import socket
4
+ from abc import abstractmethod
5
+ from collections.abc import Callable, Collection, Mapping
6
+ from contextlib import AsyncExitStack
7
+ from io import IOBase
8
+ from ipaddress import IPv4Address, IPv6Address
9
+ from socket import AddressFamily
10
+ from types import TracebackType
11
+ from typing import Any, TypeVar, Union
12
+
13
+ from .._core._typedattr import (
14
+ TypedAttributeProvider,
15
+ TypedAttributeSet,
16
+ typed_attribute,
17
+ )
18
+ from ._streams import ByteStream, Listener, UnreliableObjectStream
19
+ from ._tasks import TaskGroup
20
+
21
+ IPAddressType = Union[str, IPv4Address, IPv6Address]
22
+ IPSockAddrType = tuple[str, int]
23
+ SockAddrType = Union[IPSockAddrType, str]
24
+ UDPPacketType = tuple[bytes, IPSockAddrType]
25
+ UNIXDatagramPacketType = tuple[bytes, str]
26
+ T_Retval = TypeVar("T_Retval")
27
+
28
+
29
+ class _NullAsyncContextManager:
30
+ async def __aenter__(self) -> None:
31
+ pass
32
+
33
+ async def __aexit__(
34
+ self,
35
+ exc_type: type[BaseException] | None,
36
+ exc_val: BaseException | None,
37
+ exc_tb: TracebackType | None,
38
+ ) -> bool | None:
39
+ return None
40
+
41
+
42
+ class SocketAttribute(TypedAttributeSet):
43
+ #: the address family of the underlying socket
44
+ family: AddressFamily = typed_attribute()
45
+ #: the local socket address of the underlying socket
46
+ local_address: SockAddrType = typed_attribute()
47
+ #: for IP addresses, the local port the underlying socket is bound to
48
+ local_port: int = typed_attribute()
49
+ #: the underlying stdlib socket object
50
+ raw_socket: socket.socket = typed_attribute()
51
+ #: the remote address the underlying socket is connected to
52
+ remote_address: SockAddrType = typed_attribute()
53
+ #: for IP addresses, the remote port the underlying socket is connected to
54
+ remote_port: int = typed_attribute()
55
+
56
+
57
+ class _SocketProvider(TypedAttributeProvider):
58
+ @property
59
+ def extra_attributes(self) -> Mapping[Any, Callable[[], Any]]:
60
+ from .._core._sockets import convert_ipv6_sockaddr as convert
61
+
62
+ attributes: dict[Any, Callable[[], Any]] = {
63
+ SocketAttribute.family: lambda: self._raw_socket.family,
64
+ SocketAttribute.local_address: lambda: convert(
65
+ self._raw_socket.getsockname()
66
+ ),
67
+ SocketAttribute.raw_socket: lambda: self._raw_socket,
68
+ }
69
+ try:
70
+ peername: tuple[str, int] | None = convert(self._raw_socket.getpeername())
71
+ except OSError:
72
+ peername = None
73
+
74
+ # Provide the remote address for connected sockets
75
+ if peername is not None:
76
+ attributes[SocketAttribute.remote_address] = lambda: peername
77
+
78
+ # Provide local and remote ports for IP based sockets
79
+ if self._raw_socket.family in (AddressFamily.AF_INET, AddressFamily.AF_INET6):
80
+ attributes[SocketAttribute.local_port] = (
81
+ lambda: self._raw_socket.getsockname()[1]
82
+ )
83
+ if peername is not None:
84
+ remote_port = peername[1]
85
+ attributes[SocketAttribute.remote_port] = lambda: remote_port
86
+
87
+ return attributes
88
+
89
+ @property
90
+ @abstractmethod
91
+ def _raw_socket(self) -> socket.socket:
92
+ pass
93
+
94
+
95
+ class SocketStream(ByteStream, _SocketProvider):
96
+ """
97
+ Transports bytes over a socket.
98
+
99
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
100
+ """
101
+
102
+
103
+ class UNIXSocketStream(SocketStream):
104
+ @abstractmethod
105
+ async def send_fds(self, message: bytes, fds: Collection[int | IOBase]) -> None:
106
+ """
107
+ Send file descriptors along with a message to the peer.
108
+
109
+ :param message: a non-empty bytestring
110
+ :param fds: a collection of files (either numeric file descriptors or open file
111
+ or socket objects)
112
+ """
113
+
114
+ @abstractmethod
115
+ async def receive_fds(self, msglen: int, maxfds: int) -> tuple[bytes, list[int]]:
116
+ """
117
+ Receive file descriptors along with a message from the peer.
118
+
119
+ :param msglen: length of the message to expect from the peer
120
+ :param maxfds: maximum number of file descriptors to expect from the peer
121
+ :return: a tuple of (message, file descriptors)
122
+ """
123
+
124
+
125
+ class SocketListener(Listener[SocketStream], _SocketProvider):
126
+ """
127
+ Listens to incoming socket connections.
128
+
129
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
130
+ """
131
+
132
+ @abstractmethod
133
+ async def accept(self) -> SocketStream:
134
+ """Accept an incoming connection."""
135
+
136
+ async def serve(
137
+ self,
138
+ handler: Callable[[SocketStream], Any],
139
+ task_group: TaskGroup | None = None,
140
+ ) -> None:
141
+ from .. import create_task_group
142
+
143
+ async with AsyncExitStack() as stack:
144
+ if task_group is None:
145
+ task_group = await stack.enter_async_context(create_task_group())
146
+
147
+ while True:
148
+ stream = await self.accept()
149
+ task_group.start_soon(handler, stream)
150
+
151
+
152
+ class UDPSocket(UnreliableObjectStream[UDPPacketType], _SocketProvider):
153
+ """
154
+ Represents an unconnected UDP socket.
155
+
156
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
157
+ """
158
+
159
+ async def sendto(self, data: bytes, host: str, port: int) -> None:
160
+ """
161
+ Alias for :meth:`~.UnreliableObjectSendStream.send` ((data, (host, port))).
162
+
163
+ """
164
+ return await self.send((data, (host, port)))
165
+
166
+
167
+ class ConnectedUDPSocket(UnreliableObjectStream[bytes], _SocketProvider):
168
+ """
169
+ Represents an connected UDP socket.
170
+
171
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
172
+ """
173
+
174
+
175
+ class UNIXDatagramSocket(
176
+ UnreliableObjectStream[UNIXDatagramPacketType], _SocketProvider
177
+ ):
178
+ """
179
+ Represents an unconnected Unix datagram socket.
180
+
181
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
182
+ """
183
+
184
+ async def sendto(self, data: bytes, path: str) -> None:
185
+ """Alias for :meth:`~.UnreliableObjectSendStream.send` ((data, path))."""
186
+ return await self.send((data, path))
187
+
188
+
189
+ class ConnectedUNIXDatagramSocket(UnreliableObjectStream[bytes], _SocketProvider):
190
+ """
191
+ Represents a connected Unix datagram socket.
192
+
193
+ Supports all relevant extra attributes from :class:`~SocketAttribute`.
194
+ """
.venv/lib/python3.11/site-packages/anyio/abc/_streams.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from abc import abstractmethod
4
+ from collections.abc import Callable
5
+ from typing import Any, Generic, TypeVar, Union
6
+
7
+ from .._core._exceptions import EndOfStream
8
+ from .._core._typedattr import TypedAttributeProvider
9
+ from ._resources import AsyncResource
10
+ from ._tasks import TaskGroup
11
+
12
+ T_Item = TypeVar("T_Item")
13
+ T_co = TypeVar("T_co", covariant=True)
14
+ T_contra = TypeVar("T_contra", contravariant=True)
15
+
16
+
17
+ class UnreliableObjectReceiveStream(
18
+ Generic[T_co], AsyncResource, TypedAttributeProvider
19
+ ):
20
+ """
21
+ An interface for receiving objects.
22
+
23
+ This interface makes no guarantees that the received messages arrive in the order in
24
+ which they were sent, or that no messages are missed.
25
+
26
+ Asynchronously iterating over objects of this type will yield objects matching the
27
+ given type parameter.
28
+ """
29
+
30
+ def __aiter__(self) -> UnreliableObjectReceiveStream[T_co]:
31
+ return self
32
+
33
+ async def __anext__(self) -> T_co:
34
+ try:
35
+ return await self.receive()
36
+ except EndOfStream:
37
+ raise StopAsyncIteration
38
+
39
+ @abstractmethod
40
+ async def receive(self) -> T_co:
41
+ """
42
+ Receive the next item.
43
+
44
+ :raises ~anyio.ClosedResourceError: if the receive stream has been explicitly
45
+ closed
46
+ :raises ~anyio.EndOfStream: if this stream has been closed from the other end
47
+ :raises ~anyio.BrokenResourceError: if this stream has been rendered unusable
48
+ due to external causes
49
+ """
50
+
51
+
52
+ class UnreliableObjectSendStream(
53
+ Generic[T_contra], AsyncResource, TypedAttributeProvider
54
+ ):
55
+ """
56
+ An interface for sending objects.
57
+
58
+ This interface makes no guarantees that the messages sent will reach the
59
+ recipient(s) in the same order in which they were sent, or at all.
60
+ """
61
+
62
+ @abstractmethod
63
+ async def send(self, item: T_contra) -> None:
64
+ """
65
+ Send an item to the peer(s).
66
+
67
+ :param item: the item to send
68
+ :raises ~anyio.ClosedResourceError: if the send stream has been explicitly
69
+ closed
70
+ :raises ~anyio.BrokenResourceError: if this stream has been rendered unusable
71
+ due to external causes
72
+ """
73
+
74
+
75
+ class UnreliableObjectStream(
76
+ UnreliableObjectReceiveStream[T_Item], UnreliableObjectSendStream[T_Item]
77
+ ):
78
+ """
79
+ A bidirectional message stream which does not guarantee the order or reliability of
80
+ message delivery.
81
+ """
82
+
83
+
84
+ class ObjectReceiveStream(UnreliableObjectReceiveStream[T_co]):
85
+ """
86
+ A receive message stream which guarantees that messages are received in the same
87
+ order in which they were sent, and that no messages are missed.
88
+ """
89
+
90
+
91
+ class ObjectSendStream(UnreliableObjectSendStream[T_contra]):
92
+ """
93
+ A send message stream which guarantees that messages are delivered in the same order
94
+ in which they were sent, without missing any messages in the middle.
95
+ """
96
+
97
+
98
+ class ObjectStream(
99
+ ObjectReceiveStream[T_Item],
100
+ ObjectSendStream[T_Item],
101
+ UnreliableObjectStream[T_Item],
102
+ ):
103
+ """
104
+ A bidirectional message stream which guarantees the order and reliability of message
105
+ delivery.
106
+ """
107
+
108
+ @abstractmethod
109
+ async def send_eof(self) -> None:
110
+ """
111
+ Send an end-of-file indication to the peer.
112
+
113
+ You should not try to send any further data to this stream after calling this
114
+ method. This method is idempotent (does nothing on successive calls).
115
+ """
116
+
117
+
118
+ class ByteReceiveStream(AsyncResource, TypedAttributeProvider):
119
+ """
120
+ An interface for receiving bytes from a single peer.
121
+
122
+ Iterating this byte stream will yield a byte string of arbitrary length, but no more
123
+ than 65536 bytes.
124
+ """
125
+
126
+ def __aiter__(self) -> ByteReceiveStream:
127
+ return self
128
+
129
+ async def __anext__(self) -> bytes:
130
+ try:
131
+ return await self.receive()
132
+ except EndOfStream:
133
+ raise StopAsyncIteration
134
+
135
+ @abstractmethod
136
+ async def receive(self, max_bytes: int = 65536) -> bytes:
137
+ """
138
+ Receive at most ``max_bytes`` bytes from the peer.
139
+
140
+ .. note:: Implementors of this interface should not return an empty
141
+ :class:`bytes` object, and users should ignore them.
142
+
143
+ :param max_bytes: maximum number of bytes to receive
144
+ :return: the received bytes
145
+ :raises ~anyio.EndOfStream: if this stream has been closed from the other end
146
+ """
147
+
148
+
149
+ class ByteSendStream(AsyncResource, TypedAttributeProvider):
150
+ """An interface for sending bytes to a single peer."""
151
+
152
+ @abstractmethod
153
+ async def send(self, item: bytes) -> None:
154
+ """
155
+ Send the given bytes to the peer.
156
+
157
+ :param item: the bytes to send
158
+ """
159
+
160
+
161
+ class ByteStream(ByteReceiveStream, ByteSendStream):
162
+ """A bidirectional byte stream."""
163
+
164
+ @abstractmethod
165
+ async def send_eof(self) -> None:
166
+ """
167
+ Send an end-of-file indication to the peer.
168
+
169
+ You should not try to send any further data to this stream after calling this
170
+ method. This method is idempotent (does nothing on successive calls).
171
+ """
172
+
173
+
174
+ #: Type alias for all unreliable bytes-oriented receive streams.
175
+ AnyUnreliableByteReceiveStream = Union[
176
+ UnreliableObjectReceiveStream[bytes], ByteReceiveStream
177
+ ]
178
+ #: Type alias for all unreliable bytes-oriented send streams.
179
+ AnyUnreliableByteSendStream = Union[UnreliableObjectSendStream[bytes], ByteSendStream]
180
+ #: Type alias for all unreliable bytes-oriented streams.
181
+ AnyUnreliableByteStream = Union[UnreliableObjectStream[bytes], ByteStream]
182
+ #: Type alias for all bytes-oriented receive streams.
183
+ AnyByteReceiveStream = Union[ObjectReceiveStream[bytes], ByteReceiveStream]
184
+ #: Type alias for all bytes-oriented send streams.
185
+ AnyByteSendStream = Union[ObjectSendStream[bytes], ByteSendStream]
186
+ #: Type alias for all bytes-oriented streams.
187
+ AnyByteStream = Union[ObjectStream[bytes], ByteStream]
188
+
189
+
190
+ class Listener(Generic[T_co], AsyncResource, TypedAttributeProvider):
191
+ """An interface for objects that let you accept incoming connections."""
192
+
193
+ @abstractmethod
194
+ async def serve(
195
+ self, handler: Callable[[T_co], Any], task_group: TaskGroup | None = None
196
+ ) -> None:
197
+ """
198
+ Accept incoming connections as they come in and start tasks to handle them.
199
+
200
+ :param handler: a callable that will be used to handle each accepted connection
201
+ :param task_group: the task group that will be used to start tasks for handling
202
+ each accepted connection (if omitted, an ad-hoc task group will be created)
203
+ """
.venv/lib/python3.11/site-packages/anyio/abc/_tasks.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from abc import ABCMeta, abstractmethod
5
+ from collections.abc import Awaitable, Callable
6
+ from types import TracebackType
7
+ from typing import TYPE_CHECKING, Any, Protocol, TypeVar, overload
8
+
9
+ if sys.version_info >= (3, 11):
10
+ from typing import TypeVarTuple, Unpack
11
+ else:
12
+ from typing_extensions import TypeVarTuple, Unpack
13
+
14
+ if TYPE_CHECKING:
15
+ from .._core._tasks import CancelScope
16
+
17
+ T_Retval = TypeVar("T_Retval")
18
+ T_contra = TypeVar("T_contra", contravariant=True)
19
+ PosArgsT = TypeVarTuple("PosArgsT")
20
+
21
+
22
+ class TaskStatus(Protocol[T_contra]):
23
+ @overload
24
+ def started(self: TaskStatus[None]) -> None: ...
25
+
26
+ @overload
27
+ def started(self, value: T_contra) -> None: ...
28
+
29
+ def started(self, value: T_contra | None = None) -> None:
30
+ """
31
+ Signal that the task has started.
32
+
33
+ :param value: object passed back to the starter of the task
34
+ """
35
+
36
+
37
+ class TaskGroup(metaclass=ABCMeta):
38
+ """
39
+ Groups several asynchronous tasks together.
40
+
41
+ :ivar cancel_scope: the cancel scope inherited by all child tasks
42
+ :vartype cancel_scope: CancelScope
43
+
44
+ .. note:: On asyncio, support for eager task factories is considered to be
45
+ **experimental**. In particular, they don't follow the usual semantics of new
46
+ tasks being scheduled on the next iteration of the event loop, and may thus
47
+ cause unexpected behavior in code that wasn't written with such semantics in
48
+ mind.
49
+ """
50
+
51
+ cancel_scope: CancelScope
52
+
53
+ @abstractmethod
54
+ def start_soon(
55
+ self,
56
+ func: Callable[[Unpack[PosArgsT]], Awaitable[Any]],
57
+ *args: Unpack[PosArgsT],
58
+ name: object = None,
59
+ ) -> None:
60
+ """
61
+ Start a new task in this task group.
62
+
63
+ :param func: a coroutine function
64
+ :param args: positional arguments to call the function with
65
+ :param name: name of the task, for the purposes of introspection and debugging
66
+
67
+ .. versionadded:: 3.0
68
+ """
69
+
70
+ @abstractmethod
71
+ async def start(
72
+ self,
73
+ func: Callable[..., Awaitable[Any]],
74
+ *args: object,
75
+ name: object = None,
76
+ ) -> Any:
77
+ """
78
+ Start a new task and wait until it signals for readiness.
79
+
80
+ :param func: a coroutine function
81
+ :param args: positional arguments to call the function with
82
+ :param name: name of the task, for the purposes of introspection and debugging
83
+ :return: the value passed to ``task_status.started()``
84
+ :raises RuntimeError: if the task finishes without calling
85
+ ``task_status.started()``
86
+
87
+ .. versionadded:: 3.0
88
+ """
89
+
90
+ @abstractmethod
91
+ async def __aenter__(self) -> TaskGroup:
92
+ """Enter the task group context and allow starting new tasks."""
93
+
94
+ @abstractmethod
95
+ async def __aexit__(
96
+ self,
97
+ exc_type: type[BaseException] | None,
98
+ exc_val: BaseException | None,
99
+ exc_tb: TracebackType | None,
100
+ ) -> bool | None:
101
+ """Exit the task group context waiting for all tasks to finish."""
.venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
.venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/METADATA ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.3
2
+ Name: jsonschema-specifications
3
+ Version: 2024.10.1
4
+ Summary: The JSON Schema meta-schemas and vocabularies, exposed as a Registry
5
+ Project-URL: Documentation, https://jsonschema-specifications.readthedocs.io/
6
+ Project-URL: Homepage, https://github.com/python-jsonschema/jsonschema-specifications
7
+ Project-URL: Issues, https://github.com/python-jsonschema/jsonschema-specifications/issues/
8
+ Project-URL: Funding, https://github.com/sponsors/Julian
9
+ Project-URL: Tidelift, https://tidelift.com/subscription/pkg/pypi-jsonschema-specifications?utm_source=pypi-jsonschema-specifications&utm_medium=referral&utm_campaign=pypi-link
10
+ Project-URL: Source, https://github.com/python-jsonschema/jsonschema-specifications
11
+ Author-email: Julian Berman <Julian+jsonschema-specifications@GrayVines.com>
12
+ License-File: COPYING
13
+ Keywords: data validation,json,json schema,jsonschema,validation
14
+ Classifier: Development Status :: 5 - Production/Stable
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: Implementation :: CPython
26
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
27
+ Classifier: Topic :: File Formats :: JSON
28
+ Classifier: Topic :: File Formats :: JSON :: JSON Schema
29
+ Requires-Python: >=3.9
30
+ Requires-Dist: referencing>=0.31.0
31
+ Description-Content-Type: text/x-rst
32
+
33
+ =============================
34
+ ``jsonschema-specifications``
35
+ =============================
36
+
37
+ |PyPI| |Pythons| |CI| |ReadTheDocs|
38
+
39
+ JSON support files from the `JSON Schema Specifications <https://json-schema.org/specification.html>`_ (metaschemas, vocabularies, etc.), packaged for runtime access from Python as a `referencing-based Schema Registry <https://referencing.readthedocs.io/en/stable/api/#referencing.Registry>`_.
40
+
41
+ .. |PyPI| image:: https://img.shields.io/pypi/v/jsonschema-specifications.svg
42
+ :alt: PyPI version
43
+ :target: https://pypi.org/project/jsonschema-specifications/
44
+
45
+ .. |Pythons| image:: https://img.shields.io/pypi/pyversions/jsonschema-specifications.svg
46
+ :alt: Supported Python versions
47
+ :target: https://pypi.org/project/jsonschema-specifications/
48
+
49
+ .. |CI| image:: https://github.com/python-jsonschema/jsonschema-specifications/workflows/CI/badge.svg
50
+ :alt: Build status
51
+ :target: https://github.com/python-jsonschema/jsonschema-specifications/actions?query=workflow%3ACI
52
+
53
+ .. |ReadTheDocs| image:: https://readthedocs.org/projects/jsonschema-specifications/badge/?version=stable&style=flat
54
+ :alt: ReadTheDocs status
55
+ :target: https://jsonschema-specifications.readthedocs.io/en/stable/
.venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/RECORD ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ jsonschema_specifications-2024.10.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ jsonschema_specifications-2024.10.1.dist-info/METADATA,sha256=-jCfClPka5D4aDTtJ683zNiEcSHXhPBLuk9r9XWwyHI,2985
3
+ jsonschema_specifications-2024.10.1.dist-info/RECORD,,
4
+ jsonschema_specifications-2024.10.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
5
+ jsonschema_specifications-2024.10.1.dist-info/licenses/COPYING,sha256=QtzWNJX4e063x3V6-jebtVpT-Ur9el9lfZrfVyNuUVw,1057
6
+ jsonschema_specifications/__init__.py,sha256=qoTB2DKY7qvNrGhMPH6gtmAJRLilmVQ-fFZwT6ryqw0,386
7
+ jsonschema_specifications/__pycache__/__init__.cpython-311.pyc,,
8
+ jsonschema_specifications/__pycache__/_core.cpython-311.pyc,,
9
+ jsonschema_specifications/_core.py,sha256=tFhc1CMleJ3AJOK_bjxOpFQTdrsUClFGfFxPBU_CebM,1140
10
+ jsonschema_specifications/schemas/draft201909/metaschema.json,sha256=e3YbPhIfCgyh6ioLjizIVrz4AWBLgmjXG6yqICvAwTs,1785
11
+ jsonschema_specifications/schemas/draft201909/vocabularies/applicator,sha256=aJUQDplyb7sQcFhRK77D7P1LJOj9L6zuPlBe5ysNTDE,1860
12
+ jsonschema_specifications/schemas/draft201909/vocabularies/content,sha256=m31PVaTi_bAsQwBo_f-rxzKt3OI42j8d8mkCScM1MnQ,517
13
+ jsonschema_specifications/schemas/draft201909/vocabularies/core,sha256=taLElX9kldClCB8ECevooU5BOayyA_x0hHH47eKvWyw,1531
14
+ jsonschema_specifications/schemas/draft201909/vocabularies/meta-data,sha256=1H4kRd1qgicaKY2DzGxsuNSuHhXg3Fa-zTehY-zwEoY,892
15
+ jsonschema_specifications/schemas/draft201909/vocabularies/validation,sha256=HlJsHTNac0gF_ILPV5jBK5YK19olF8Zs2lobCTWcPBw,2834
16
+ jsonschema_specifications/schemas/draft202012/metaschema.json,sha256=Qdp29a-3zgYtJI92JGOpL3ykfk4PkFsiS6av7vkd7Q8,2452
17
+ jsonschema_specifications/schemas/draft202012/vocabularies/applicator,sha256=xKbkFHuR_vf-ptwFjLG_k0AvdBS3ZXiosWqvHa1qrO8,1659
18
+ jsonschema_specifications/schemas/draft202012/vocabularies/content,sha256=CDQ3R3ZOSlgUJieTz01lIFenkThjxZUNQyl-jh_axbY,519
19
+ jsonschema_specifications/schemas/draft202012/vocabularies/core,sha256=wtEqjk3RHTNt_IOj9mOqTGnwtJs76wlP_rJbUxb0gD0,1564
20
+ jsonschema_specifications/schemas/draft202012/vocabularies/format,sha256=UOu_55BhGoSbjMQAoJwdDg-2q1wNQ6DyIgH9NiUFa_Q,403
21
+ jsonschema_specifications/schemas/draft202012/vocabularies/format-annotation,sha256=q8d1rf79idIjWBcNm_k_Tr0jSVY7u-3WDwK-98gSvMA,448
22
+ jsonschema_specifications/schemas/draft202012/vocabularies/format-assertion,sha256=xSJCuaG7eGsmw-gset1CjDH5yW5XXc6Z5W6l_qptogw,445
23
+ jsonschema_specifications/schemas/draft202012/vocabularies/meta-data,sha256=j3bW4U9Bubku-TO3CM3FFEyLUmhlGtEZGEhfsXVPHHY,892
24
+ jsonschema_specifications/schemas/draft202012/vocabularies/unevaluated,sha256=Lb-8tzmUtnCwl2SSre4f_7RsIWgnhNL1pMpWH54tDLQ,506
25
+ jsonschema_specifications/schemas/draft202012/vocabularies/validation,sha256=cBCjHlQfMtK-ch4t40jfdcmzaHaj7TBId_wKvaHTelg,2834
26
+ jsonschema_specifications/schemas/draft3/metaschema.json,sha256=LPdfZENvtb43Si6qJ6uLfh_WUcm0ba6mxnsC_WTiRYs,2600
27
+ jsonschema_specifications/schemas/draft4/metaschema.json,sha256=4UidC0dV8CeTMCWR0_y48Htok6gqlPJIlfjk7fEbguI,4357
28
+ jsonschema_specifications/schemas/draft6/metaschema.json,sha256=wp386fVINcOgbAOzxdXsDtp3cGVo-cTffPvHVmpRAG0,4437
29
+ jsonschema_specifications/schemas/draft7/metaschema.json,sha256=PVOSCIJhYGxVm2A_OFMpyfGrRbXWZ-uZBodFOwVdQF4,4819
30
+ jsonschema_specifications/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ jsonschema_specifications/tests/__pycache__/__init__.cpython-311.pyc,,
32
+ jsonschema_specifications/tests/__pycache__/test_jsonschema_specifications.cpython-311.pyc,,
33
+ jsonschema_specifications/tests/test_jsonschema_specifications.py,sha256=WkbYRW6A6FoZ0rivShfqVLSCsAiHJ2x8TxqECJTXPTY,1106
.venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.25.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
.venv/lib/python3.11/site-packages/jsonschema_specifications-2024.10.1.dist-info/licenses/COPYING ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 Julian Berman
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
.venv/lib/python3.11/site-packages/xformers/__init__.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+
9
+ import torch
10
+
11
+ from . import _cpp_lib
12
+ from .checkpoint import ( # noqa: E402, F401
13
+ checkpoint,
14
+ get_optimal_checkpoint_policy,
15
+ list_operators,
16
+ selective_checkpoint_wrapper,
17
+ )
18
+
19
+ try:
20
+ from .version import __version__ # noqa: F401
21
+ except ImportError:
22
+ __version__ = "0.0.0"
23
+
24
+
25
+ logger = logging.getLogger("xformers")
26
+
27
+ _has_cpp_library: bool = _cpp_lib._cpp_library_load_exception is None
28
+
29
+ _is_opensource: bool = True
30
+
31
+
32
+ def compute_once(func):
33
+ value = None
34
+
35
+ def func_wrapper():
36
+ nonlocal value
37
+ if value is None:
38
+ value = func()
39
+ return value
40
+
41
+ return func_wrapper
42
+
43
+
44
+ @compute_once
45
+ def _is_triton_available():
46
+ if os.environ.get("XFORMERS_ENABLE_TRITON", "0") == "1":
47
+ return True
48
+ if not torch.cuda.is_available():
49
+ return False
50
+ if os.environ.get("XFORMERS_FORCE_DISABLE_TRITON", "0") == "1":
51
+ return False
52
+ # We have many errors on V100 with recent triton versions
53
+ # Let's just drop support for triton kernels below A100
54
+ if torch.cuda.get_device_capability("cuda") < (8, 0):
55
+ return False
56
+ try:
57
+ import triton # noqa
58
+
59
+ return True
60
+ except (ImportError, AttributeError):
61
+ logger.warning(
62
+ "A matching Triton is not available, some optimizations will not be enabled",
63
+ exc_info=True,
64
+ )
65
+ return False
66
+
67
+
68
+ @compute_once
69
+ def get_python_lib():
70
+ return torch.library.Library("xformers_python", "DEF")
71
+
72
+
73
+ # end of file
.venv/lib/python3.11/site-packages/xformers/_cpp_lib.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import dataclasses
7
+ import json
8
+ import logging
9
+ import os
10
+ import platform
11
+ from typing import Any, Dict, Optional
12
+
13
+ import torch
14
+
15
+ logger = logging.getLogger("xformers")
16
+
17
+ UNAVAILABLE_FEATURES_MSG = (
18
+ " Memory-efficient attention, SwiGLU, sparse and more won't be available."
19
+ )
20
+
21
+
22
+ @dataclasses.dataclass
23
+ class _BuildInfo:
24
+ metadata: Dict[str, Any]
25
+
26
+ @property
27
+ def cuda_version(self) -> Optional[int]:
28
+ return self.metadata["version"]["cuda"]
29
+
30
+ @property
31
+ def hip_version(self) -> Optional[int]:
32
+ return self.metadata["version"]["hip"]
33
+
34
+ @property
35
+ def torch_version(self) -> str:
36
+ return self.metadata["version"]["torch"]
37
+
38
+ @property
39
+ def python_version(self) -> str:
40
+ return self.metadata["version"]["python"]
41
+
42
+ @property
43
+ def flash_version(self) -> str:
44
+ return self.metadata["version"].get("flash", "0.0.0")
45
+
46
+ @property
47
+ def use_torch_flash(self) -> bool:
48
+ return self.metadata["version"].get("use_torch_flash", False)
49
+
50
+ @property
51
+ def build_env(self) -> Dict[str, Any]:
52
+ return self.metadata["env"]
53
+
54
+
55
class xFormersWasNotBuiltException(Exception):
    """Raised when no compiled C++/CUDA extension module can be found."""

    def __str__(self) -> str:
        parts = [
            "Need to compile C++ extensions to use all xFormers features.\n",
            " Please install xformers properly ",
            "(see https://github.com/facebookresearch/xformers#installing-xformers)\n",
            UNAVAILABLE_FEATURES_MSG,
        ]
        return "".join(parts)
63
+
64
+
65
class xFormersInvalidLibException(Exception):
    """Raised when a compiled extension exists but fails to load.

    This usually means the binary was built against a different PyTorch,
    CUDA, or Python version than the one currently running.
    """

    def __init__(self, build_info: Optional[_BuildInfo]) -> None:
        # build_info may be None when even the build metadata is unreadable.
        self.build_info = build_info

    def __str__(self) -> str:
        if self.build_info is None:
            msg = "xFormers was built for a different version of PyTorch or Python."
        else:
            # Show both build-time and runtime versions so the mismatch is obvious.
            msg = f"""xFormers was built for:
 PyTorch {self.build_info.torch_version} with CUDA {self.build_info.cuda_version} (you have {torch.__version__})
 Python {self.build_info.python_version} (you have {platform.python_version()})"""
        return (
            "xFormers can't load C++/CUDA extensions. "
            + msg
            + "\n Please reinstall xformers "
            "(see https://github.com/facebookresearch/xformers#installing-xformers)\n"
            + UNAVAILABLE_FEATURES_MSG
        )
+
84
+
85
+ def _register_extensions():
86
+ import importlib
87
+ import os
88
+
89
+ import torch
90
+
91
+ # load the custom_op_library and register the custom ops
92
+ lib_dir = os.path.dirname(__file__)
93
+ if os.name == "nt":
94
+ # Register the main torchvision library location on the default DLL path
95
+ import ctypes
96
+ import sys
97
+
98
+ kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
99
+ with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
100
+ prev_error_mode = kernel32.SetErrorMode(0x0001)
101
+
102
+ if with_load_library_flags:
103
+ kernel32.AddDllDirectory.restype = ctypes.c_void_p
104
+
105
+ if sys.version_info >= (3, 8):
106
+ os.add_dll_directory(lib_dir)
107
+ elif with_load_library_flags:
108
+ res = kernel32.AddDllDirectory(lib_dir)
109
+ if res is None:
110
+ err = ctypes.WinError(ctypes.get_last_error())
111
+ err.strerror += f' Error adding "{lib_dir}" to the DLL directories.'
112
+ raise err
113
+
114
+ kernel32.SetErrorMode(prev_error_mode)
115
+
116
+ loader_details = (
117
+ importlib.machinery.ExtensionFileLoader,
118
+ importlib.machinery.EXTENSION_SUFFIXES,
119
+ )
120
+
121
+ extfinder = importlib.machinery.FileFinder(lib_dir, loader_details)
122
+ if torch.version.hip and not hasattr(torch.version, "git_version"):
123
+ ext_specs = extfinder.find_spec("_C_hip")
124
+ else:
125
+ ext_specs = extfinder.find_spec("_C")
126
+ if ext_specs is None:
127
+ raise xFormersWasNotBuiltException()
128
+ cpp_lib_json = os.path.join(lib_dir, "cpp_lib.json")
129
+ with open(cpp_lib_json, "r") as fp:
130
+ build_metadata = _BuildInfo(json.load(fp))
131
+ try:
132
+ torch.ops.load_library(ext_specs.origin)
133
+ except OSError as exc:
134
+ raise xFormersInvalidLibException(build_metadata) from exc
135
+ return build_metadata
136
+
137
+
138
# Import-time attempt to load the compiled extension. On failure we only
# warn (xFormers stays importable with reduced features) and remember the
# exception so callers (e.g. xformers/__init__.py) can inspect it.
_cpp_library_load_exception: Optional[Exception] = None
_build_metadata: Optional[_BuildInfo] = None

try:
    _build_metadata = _register_extensions()
except (xFormersInvalidLibException, xFormersWasNotBuiltException) as e:
    ENV_VAR_FOR_DETAILS = "XFORMERS_MORE_DETAILS"
    if os.environ.get(ENV_VAR_FOR_DETAILS, False):
        # Verbose mode: include the full traceback.
        logger.warning(f"WARNING[XFORMERS]: {e}", exc_info=e)
    else:
        logger.warning(
            f"WARNING[XFORMERS]: {e}\n Set {ENV_VAR_FOR_DETAILS}=1 for more details"
        )
    _cpp_library_load_exception = e

# True only when the extension loaded AND it was built with CUDA support.
_built_with_cuda = (
    _build_metadata is not None and _build_metadata.cuda_version is not None
)
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import warnings
7
+
8
+
9
def deprecated_function(self):
    """Emit a ``FutureWarning`` stating that the given object is deprecated.

    ``stacklevel=2`` attributes the warning to the caller's call site.
    """
    warnings.warn(
        f"{self!r} is deprecated and is not maintained anymore. "
        "It might be removed in a future version of xFormers",
        FutureWarning,
        stacklevel=2,
    )
.venv/lib/python3.11/site-packages/xformers/attn_bias_utils.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import random
8
+ from typing import List, Optional, Sequence, Tuple, Type
9
+
10
+ import torch
11
+
12
+ from xformers.ops import AttentionBias, fmha
13
+ from xformers.ops.fmha.attn_bias import AttentionBiasSubTensor
14
+ from xformers.ops.fmha.common import AttentionOpBase
15
+
16
+
17
+ def _create_aligned_bias(*shape: int, **kwargs) -> torch.Tensor:
18
+ align_to = 8
19
+ return (
20
+ torch.randn(
21
+ (
22
+ *shape[:-1],
23
+ align_to * ((shape[-1] + align_to - 1) // align_to),
24
+ ),
25
+ **kwargs,
26
+ )
27
+ * 3
28
+ ).narrow(-1, 0, shape[-1])
29
+
30
+
31
def create_attn_bias(
    bias_type,
    batch_size: int,
    num_heads: int,
    num_heads_groups: int,
    q_len: int,
    kv_len: int,
    device,
    dtype,
    requires_grad: bool,
    fmt: str,
    op: Optional[Type[AttentionOpBase]] = None,
    page_size: Optional[int] = None,
):
    """Build a test attention bias of the requested ``bias_type``.

    Returns ``None``, a dense ``torch.Tensor`` bias, or an instance of one
    of the ``fmha.attn_bias`` mask classes. Randomness is seeded from
    (batch_size, q_len, kv_len, dtype, fmt), so the result is deterministic
    for a given configuration. ``fmt`` is the tensor layout the caller uses
    ("BMK", "BMHK" or "BMGHK"); some bias types only support a subset.
    ``page_size`` is required for the paged bias types.
    """
    if bias_type is None or isinstance(None, bias_type):
        return None
    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
    # Randomly pick one of three local-attention window sizes.
    window_size = {0: 3, 1: 128, 2: 300}[r.randint(0, 2)]
    if bias_type is torch.Tensor:
        if fmt == "BMK":
            # BMK folds heads into the batch dimension.
            batch_size *= num_heads
            num_heads = 1
        if op is not None and issubclass(op, fmha.triton_splitk.FwOp):
            attn_bias = (
                torch.randn(
                    (batch_size, num_heads_groups, num_heads, q_len, kv_len),
                    device=device,
                    dtype=dtype,
                )
                * 3
            )
            if fmt in ["BMK", "BMHK"]:
                attn_bias = attn_bias[:, 0]
        else:
            attn_bias = _create_aligned_bias(
                batch_size,
                num_heads_groups,
                num_heads,
                q_len,
                kv_len,
                device=device,
                dtype=dtype,
            )

            # make sure it also works if the first columns/rows are partially masked out
            attn_bias[0, 0, 0, : q_len - 1, : kv_len - 1] = -math.inf
            if fmt in ["BMK", "BMHK"]:
                attn_bias = attn_bias[:, 0]

        if requires_grad:
            attn_bias.requires_grad_(True)
        if fmt == "BMK":
            attn_bias = attn_bias[:, 0]
        return attn_bias
    if bias_type is fmha.attn_bias.LowerTriangularMask:
        return bias_type()
    if bias_type is fmha.attn_bias.LowerTriangularFromBottomRightMask:
        return bias_type()
    if bias_type is fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask:
        return bias_type(window_size)
    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
        attn_bias = _create_aligned_bias(
            batch_size,
            num_heads_groups,
            num_heads,
            q_len,
            kv_len,
            device=device,
            dtype=dtype,
        )
        if fmt in ["BMK", "BMHK"]:
            attn_bias = attn_bias[:, 0]
        if fmt == "BMK":
            attn_bias = attn_bias[:, 0]
        if requires_grad:
            attn_bias.requires_grad_(True)
        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
    if bias_type in [
        fmha.attn_bias.BlockDiagonalMask,
        fmha.attn_bias.BlockDiagonalCausalMask,
        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
        fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
        fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
    ]:
        # These bias types are not supported in BMK format
        assert fmt in ["BMGHK", "BMHK"]
        # max_q_minus_k constrains num_queries - num_keys per block; see
        # _rand_seqlens for the rationale per bias type.
        max_q_minus_k = None
        if bias_type in {
            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
        }:
            max_q_minus_k = 0
        elif bias_type == fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask:
            assert window_size is not None
            max_q_minus_k = window_size - 1

        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
            *_rand_seqlens(
                r,
                batch_size,
                q_len,
                kv_len,
                max_q_minus_k=max_q_minus_k,
            )
        )
        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
            block_diag = block_diag.make_causal()
        if bias_type in {
            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
        }:
            # Rebuild as a plain BlockDiagonalMask before converting to the
            # local-attention variant.
            block_diag = fmha.attn_bias.BlockDiagonalMask(
                q_seqinfo=block_diag.q_seqinfo,
                k_seqinfo=block_diag.k_seqinfo,
                _batch_sizes=block_diag._batch_sizes,
            )
            assert window_size is not None
            if bias_type is fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask:
                block_diag = block_diag.make_local_attention(window_size)
            else:
                block_diag = block_diag.make_local_attention_from_bottomright(
                    window_size
                )
        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
            block_diag = block_diag.make_causal_from_bottomright()
        return block_diag
    if bias_type in [
        fmha.attn_bias.BlockDiagonalPaddedKeysMask,
        fmha.attn_bias.BlockDiagonalCausalLocalAttentionPaddedKeysMask,
        fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
        fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask,
        fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
    ]:
        assert fmt in ["BMHK", "BMGHK"]
        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
        # For paged types, build the unpaged equivalent first, then page it.
        block_diag_type = (
            bias_type._UNPAGED_TYPE
            if issubclass(bias_type, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask)
            else bias_type
        )
        if bias_type is fmha.attn_bias.BlockDiagonalCausalLocalAttentionPaddedKeysMask:
            g_block_diag = block_diag_type.from_seqlens_local(
                q_seqlen=q,
                kv_padding=kv_len,
                kv_seqlen=k,
                window_size=min(window_size, min(k)),
            )
        else:
            g_block_diag = block_diag_type.from_seqlens(
                q_seqlen=q,
                kv_padding=kv_len,
                kv_seqlen=k,
            )
        if issubclass(bias_type, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask):
            assert page_size is not None
            pages_per_row = (kv_len + page_size - 1) // page_size
            # Random permutation of all page indices across the batch.
            block_tables = torch.tensor(
                r.sample(range(batch_size * pages_per_row), batch_size * pages_per_row),
                device=device,
                dtype=torch.int32,
            ).reshape(batch_size, pages_per_row)
            return g_block_diag.make_paged(
                block_tables=block_tables, page_size=page_size, paged_type=bias_type
            )
        return g_block_diag
    if bias_type in [
        fmha.attn_bias.BlockDiagonalCausalWithOffsetGappyKeysMask,
        fmha.attn_bias.BlockDiagonalGappyKeysMask,
    ]:
        assert fmt in ["BMHK", "BMGHK"]
        max_q_minus_k = (
            None if bias_type is fmha.attn_bias.BlockDiagonalGappyKeysMask else 0
        )
        q, k = _rand_seqlens(r, batch_size, q_len, kv_len, max_q_minus_k)
        total_kv_len = kv_len * batch_size
        # Random start offset per block, plus a final terminator entry.
        starts = [r.randint(0, total_kv_len - ki) for ki in k] + [total_kv_len]
        return fmha.attn_bias.BlockDiagonalGappyKeysMask.from_seqlens(
            q_seqlen=q,
            kv_seqstarts=starts,
            kv_seqlen=k,
        )
    if bias_type in [
        fmha.attn_bias.PagedBlockDiagonalGappyKeysMask,
    ]:
        assert fmt in ["BMHK", "BMGHK"]
        assert page_size is not None
        pages_per_row = (kv_len + page_size - 1) // page_size
        total_queries = q_len * batch_size
        q = _rand_maxed_partition(r, total_queries, batch_size, total_queries, False)
        k = [r.randint(1, kv_len) for _ in range(batch_size)]
        row_size = pages_per_row * page_size
        # Each sequence starts somewhere inside its own notional row.
        starts = [row_size * i + r.randint(0, row_size - ki) for i, ki in enumerate(k)]
        starts.append(pages_per_row * batch_size * page_size)
        block_diag_type = bias_type._UNPAGED_TYPE  # type: ignore
        g_block_diag = block_diag_type.from_seqlens(
            q_seqlen=q,
            kv_seqstarts=starts,
            kv_seqlen=k,
        )
        block_tables = torch.tensor(
            r.sample(range(batch_size * pages_per_row), batch_size * pages_per_row),
            device=device,
            dtype=torch.int32,
        ).reshape(batch_size, pages_per_row)
        return g_block_diag.make_paged(
            block_tables=block_tables,
            page_size=page_size,
            paged_type=bias_type,
            notional_padding=page_size * pages_per_row,
        )
    if bias_type == fmha.attn_bias.LocalAttentionFromBottomRightMask:
        return bias_type(
            window_left=r.randint(0, 5),
            window_right=r.randint(0, 5),
        )

    assert False, f"Unsupported bias type: {bias_type}"
248
+
249
+
250
def _rand_seqlens(
    r: random.Random,
    bs: int,
    q_len: int,
    kv_len: int,
    max_q_minus_k: Optional[int],
) -> Tuple[Sequence[int], Sequence[int]]:
    """
    Generates lists of lengths of query blocks and corresponding key blocks.
    The total number of queries will be bs * q_len and the
    total number of keys will be bs * kv_len.
    max_q_minus_k: maximum allowed num_queries - num_keys.
    For "bottom-right" masks it's 0, we need to have more keys than
    queries, otherwise some queries have no keys to attend to.
    For BlockDiagonalCausalMask it's None, there is no constraint
    on num_queries - num_keys.
    For BlockDiagonalCausalLocalAttentionMask it's equal
    to the window size.
    """
    if max_q_minus_k == 0:
        # In case max_q_minus_k > 0 the exact condition is
        # kv_len >= q_len - max_q_minus_k * batch_size,
        # but we can't check it without knowing the actual batch size,
        # which is determined in the loop below.
        assert kv_len >= q_len
    q_len *= bs
    kv_len *= bs
    seqlens_q: List[int] = []
    seqlens_k: List[int] = []

    # (low, high) ranges fed to randrange for block sizes.
    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
        if max_q_minus_k is None:
            # Simple case - no constraint on the number of queries and keys.
            num_queries = r.randrange(*step_q)
            seqlens_q.append(num_queries)
            seqlens_k.append(r.randrange(*step_k))
        else:
            # In this case we need to make sure num_queries - num_keys < max_q_minus_k holds for every batch element.
            # To do this, when choosing num_queries and num_keys at a given step,
            # we ensure two conditions are satisfied:
            # 1) num_queries <= num_keys + max_q_minus_k for the current batch element
            # 2) Same holds for the remaining keys and queries, i.e.
            #    queries_left - num_queries <= keys_left - num_keys + max_q_minus_k
            keys_left = kv_len - sum(seqlens_k, 0)
            queries_left = q_len - sum(seqlens_q, 0)

            assert (
                keys_left >= queries_left - max_q_minus_k
            ), f"{keys_left=} {queries_left=} {max_q_minus_k=} {kv_len=} {q_len=} {seqlens_k=} {seqlens_q=}"
            # Limit num_queries from above: if num_queries > keys_left + max_q_minus_k,
            # condition num_queries <= num_keys + max_q_minus_k can't be satisfied even if we take
            # all the remaining keys
            max_queries_to_take = min(queries_left, keys_left + max_q_minus_k)
            num_queries = r.randrange(1, max_queries_to_take + 1)
            seqlens_q.append(num_queries)

            # Now we know num_queries, let's select num_keys.
            # How many keys can we use for the current batch element so that
            # for the remaining keys and values the constraint
            # num_queries - num_keys < max_q_minus_k holds on the next step?
            extra_keys_available = keys_left - queries_left + max_q_minus_k + 1
            assert extra_keys_available >= 0
            if extra_keys_available > 0:
                seqlens_k.append(num_queries + r.randrange(0, extra_keys_available))
            else:
                seqlens_k.append(num_queries)
    # The last block absorbs whatever remains so the totals match exactly.
    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
    return seqlens_q, seqlens_k
321
+
322
+
323
+ def _rand_maxed_partition(
324
+ r: random.Random, total: int, n: int, mx: int, positive: bool = True
325
+ ) -> List[int]:
326
+ # returns list of n nonnegative integers less than mx summing to total
327
+ # NB: This is unfortunately biased towards evenly-split bins.
328
+ # If `positive`, outputs are positive
329
+ if positive:
330
+ total -= n
331
+ mx -= 1
332
+ idxs = r.sample(range(n * mx), total)
333
+ y = torch.zeros(n, mx, dtype=torch.int32)
334
+ y.flatten()[idxs] = 1
335
+ z = y.sum(1)
336
+ if positive:
337
+ z += 1
338
+ return z.tolist()
339
+
340
+
341
def _rand_seqlens_padded_k(
    r: random.Random, bs: int, q_len: int, kv_len: int
) -> Tuple[Sequence[int], Sequence[int]]:
    """Draw per-batch query/key lengths for BlockDiagonalCausalWithOffsetPaddedKeysMask.

    Both returned lists have length ``bs``. Every batch element gets at
    least as many keys as queries — this bias type is "bottom right", so
    any extra queries would attend to nothing and have undefined result —
    and every key length is <= ``kv_len``.
    """
    if q_len > kv_len:
        raise ValueError("need more keys than values")
    if q_len == kv_len:
        # All key slots are needed, so no padding is possible.
        full = [kv_len] * bs
        return full, full
    per_batch_q = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
    per_batch_k = [r.randint(n_q, kv_len) for n_q in per_batch_q]
    return per_batch_q, per_batch_k
359
+
360
+
361
def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
    """Reference (pure PyTorch, float32) attention for testing.

    Accepts BMK (3-D), BMHK (4-D) or BMGHK (5-D) inputs; 4-D/5-D inputs are
    routed through ``ref_attention_bmhk``. ``attn_bias`` may be a dense
    tensor or any ``fmha`` attention-bias object (materialized here).
    ``drop_mask``/``p`` implement dropout scaling for the 3-D path only.
    """
    if q.ndim == 5:
        # BMGHK: run each head-group independently and stack on the G axis.

        def attn_bias_group(group: int):
            if isinstance(attn_bias, fmha.attn_bias.AttentionBiasSubTensor):
                if attn_bias.HOLDS_DENSE_TENSOR:
                    # Only dense-tensor-backed biases are sliceable per group.
                    return attn_bias[:, group]
            elif isinstance(attn_bias, torch.Tensor):
                return attn_bias[:, group]
            return attn_bias

        return torch.stack(
            [
                ref_attention_bmhk(
                    q[:, :, g],
                    k[:, :, g],
                    v[:, :, g],
                    scale=scale,
                    attn_bias=attn_bias_group(g),
                )
                for g in range(q.shape[2])
            ],
            dim=2,
        )
    if q.ndim == 4:
        assert p == 0.0
        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
    # BMK path: compute in float32 for accuracy.
    q = q.float()
    k = k.float()
    v = v.float()

    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
    q = q * scale

    attn = q @ k.transpose(-2, -1)
    if attn_bias is not None:
        if isinstance(attn_bias, (AttentionBias, AttentionBiasSubTensor)):
            # Always create in B,H,Mq,Mk format
            attn_bias_tensor = attn_bias.materialize(
                (q.shape[0], 1, q.shape[1], k.shape[1]),
                device=q.device,
                dtype=torch.float32,
            )
        else:
            attn_bias_tensor = attn_bias
        if attn_bias_tensor.ndim == 4:
            # Fold (B, H) into a single leading dim to match the BMK layout.
            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
            attn_bias_tensor = attn_bias_tensor.reshape(
                [-1, *attn_bias_tensor.shape[2:]]
            )
        attn = attn + attn_bias_tensor.float()
    attn = attn.softmax(-1)
    if drop_mask is not None:
        # Inverted-dropout scaling: keep-mask divided by keep probability.
        attn = attn * (drop_mask / (1 - p))
    return attn @ v
416
+
417
+
418
def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
    """Reference attention for BMHK (batch, seq, heads, dim) inputs.

    Folds heads into the batch dimension, delegates to the 3-D
    ``ref_attention``, then restores the BMHK layout.
    """
    assert q.ndim == 4

    def T(t):
        # (B, M, H, K) -> (B*H, M, K)
        return t.permute((0, 2, 1, 3)).reshape(
            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
        )

    if isinstance(attn_bias, (AttentionBias, AttentionBiasSubTensor)):
        # Materialize per-head bias and fold it the same way as q/k/v.
        attn_bias = attn_bias.materialize(
            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
            device=q.device,
            dtype=torch.float32,
        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
    return out.permute((0, 2, 1, 3))
435
+
436
+
437
def pack_kv_cache(
    cache_k: torch.Tensor,
    cache_v: torch.Tensor,
    kv_seqlens: List[int],
    BLOCK_N: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Create block tables and pages K/V cache for testing paged attention.
    Args:
        cache_k, cache_v: K/V caches, each of shape [B, MAX_T, H_kv, D].
            Note that these tensors are unexpanded,
            i.e. for multiquery case cache_k.shape[2] = 1
        kv_seqlens: list of K/V sequence lengths
        BLOCK_N: number of tokens per per paged attention block
        B: batch size
    Returns:
        block_tables: [B, MAX_BLOCKS]
        packed_cache_k: [1, total_len_rounded, H_kv, D]
        packed_cache_v: [1, total_len_rounded, H_kv, D]
        where total_len_rounded is a sum of K/V seqlens, each rounded up
        to a multiple of BLOCK_N.
    """
    # Fix: block_tables/seqstarts used to be hard-coded to device="cuda"
    # while the packed caches followed cache_k.device; everything now
    # follows the input's device (identical behavior for CUDA inputs,
    # and the function also works on CPU).
    device = cache_k.device

    # Each sequence occupies a whole number of BLOCK_N-sized pages.
    kv_seqlens_rounded = [(x + BLOCK_N - 1) // BLOCK_N * BLOCK_N for x in kv_seqlens]

    total_len_rounded = sum(kv_seqlens_rounded)

    B, MAX_T, H, D = cache_k.shape

    # NOTE: slots past each sequence's true length are left uninitialized.
    packed_cache_k = torch.empty(
        total_len_rounded, H, D, device=device, dtype=cache_k.dtype
    )
    packed_cache_v = torch.empty(
        total_len_rounded, H, D, device=device, dtype=cache_k.dtype
    )
    seqstart = 0
    for b in range(B):
        packed_cache_k[seqstart : seqstart + kv_seqlens[b]] = cache_k[
            b, : kv_seqlens[b]
        ].clone()
        packed_cache_v[seqstart : seqstart + kv_seqlens[b]] = cache_v[
            b, : kv_seqlens[b]
        ].clone()
        seqstart += kv_seqlens_rounded[b]

    num_blocks_per_row = (MAX_T + BLOCK_N - 1) // BLOCK_N
    # Before the per-row offset below, every row indexes pages
    # 0..num_blocks_per_row-1.
    block_tables = (
        torch.arange(num_blocks_per_row, device=device, dtype=torch.int32)
        .unsqueeze(0)
        .expand(B, num_blocks_per_row)
    )
    # Exclusive prefix sum of the rounded lengths, in units of pages:
    # the index of the first page belonging to each batch element.
    seqstarts = (
        (
            torch.tensor(kv_seqlens_rounded).cumsum(dim=0)
            - torch.tensor(kv_seqlens_rounded)
        )
        .to(device=device)
        .unsqueeze(1)
    ) // BLOCK_N
    block_tables = (block_tables + seqstarts).contiguous().to(dtype=torch.int32)
    return (
        block_tables,
        packed_cache_k.unsqueeze(0),
        packed_cache_v.unsqueeze(0),
    )
.venv/lib/python3.11/site-packages/xformers/checkpoint.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import functools
8
+ import time
9
+ from collections import defaultdict
10
+ from copy import deepcopy
11
+ from dataclasses import astuple, dataclass
12
+ from typing import Any, Callable, ContextManager, Dict, List, Optional, Tuple
13
+
14
+ import torch
15
+ from torch.testing._internal.composite_compliance import (
16
+ is_inplace,
17
+ is_inplace_view_fn,
18
+ is_view_fn,
19
+ )
20
+ from torch.utils._python_dispatch import TorchDispatchMode
21
+ from torch.utils._pytree import tree_map
22
+
23
+ _scipy_is_available = False
24
+ try:
25
+ from scipy.optimize import Bounds, LinearConstraint, milp
26
+
27
+ _scipy_is_available = True
28
+ except ImportError:
29
+ _scipy_is_available = False
30
+
31
+
32
+ try:
33
+ # let's keep BC for older PyTorch for now
34
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
35
+ ActivationWrapper,
36
+ )
37
+ from torch.utils.checkpoint import ( # type: ignore
38
+ _CachedTorchDispatchMode,
39
+ _CachingTorchDispatchMode,
40
+ )
41
+ except ImportError:
42
+ ActivationWrapper = torch.nn.Module # type: ignore
43
+
44
+ class _NotAvailable:
45
+ def __init__(self, *args, **kwargs):
46
+ raise RuntimeError("Need PyTorch >= 2.2")
47
+
48
+ _CachedTorchDispatchMode = _NotAvailable # type: ignore
49
+ _CachingTorchDispatchMode = _NotAvailable # type: ignore
50
+
51
+
52
+ try:
53
+ from torch.utils.checkpoint import SAC_IGNORED_OPS as _ignored_ops # type: ignore
54
+
55
+ _PT_HAS_NEW_IMPL = True
56
+ except ImportError:
57
+ from torch.utils.checkpoint import _ignored_ops # type: ignore
58
+
59
+ _PT_HAS_NEW_IMPL = False
60
+
61
+
62
+ _additional_ignored_ops = {
63
+ torch.ops.aten.lift_fresh.default,
64
+ torch.ops.profiler._record_function_exit._RecordFunction,
65
+ torch.ops.aten.clone.default, # seems needed for torch.compile
66
+ }
67
+ OPS_TO_ALWAYS_SKIP = _ignored_ops | _additional_ignored_ops
68
+
69
+
70
+ @dataclass
71
+ class ProfileMetadata:
72
+ name: str
73
+ time_taken: float
74
+ memory_used: float
75
+ curr_idx: int
76
+ output_ids: Any
77
+ inplace_info: Tuple[int, int]
78
+ is_view_like: bool
79
+ is_rand_op: bool
80
+
81
+
82
+ def _get_default_policy(allow_list=None):
83
+ _default_allow_list = [
84
+ "xformers.efficient_attention_forward_cutlass.default",
85
+ "xformers_flash.flash_fwd.default",
86
+ "aten.addmm.default",
87
+ "aten.mm.default",
88
+ ]
89
+ if allow_list is None:
90
+ allow_list = _default_allow_list
91
+
92
+ def _default_policy(ctx, func, *args, **kwargs):
93
+ return str(func) in allow_list
94
+
95
+ return _default_policy
96
+
97
+
98
+ class VerboseTorchDispatchMode(TorchDispatchMode):
99
+ def __init__(self):
100
+ self.operators = []
101
+
102
+ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
103
+ if kwargs is None:
104
+ kwargs = {}
105
+ self.operators.append(func)
106
+ return func(*args, **kwargs)
107
+
108
+
109
+ def list_operators(function, *args, **kwargs):
110
+ """
111
+ Returns the list of operators used inside `function` with
112
+ *args and **kwargs
113
+ """
114
+ verbose_mode = VerboseTorchDispatchMode()
115
+ with verbose_mode:
116
+ function(*args, **kwargs)
117
+ return verbose_mode.operators
118
+
119
+
120
+ class CachedTorchDispatchMode(_CachedTorchDispatchMode):
121
+ def __init__(self, policy_fn, storage, allow_cache_entry_mutation):
122
+ global _PT_HAS_NEW_IMPL
123
+ if _PT_HAS_NEW_IMPL:
124
+ super().__init__(policy_fn, storage, allow_cache_entry_mutation)
125
+ else:
126
+ super().__init__(policy_fn, storage)
127
+
128
+ # this is here for the old implementations
129
+ def pop_from_storage(self, func, args, kwargs):
130
+ # the autograd engine might add spurious views. This is a basic
131
+ # guard and should be improved
132
+ if self.storage[func]:
133
+ return self.storage[func].pop(0)
134
+ return func(*args, **kwargs)
135
+
136
+
137
+ class NullTorchDispatchMode(TorchDispatchMode):
138
+ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
139
+ if kwargs is None:
140
+ kwargs = {}
141
+ return func(*args, **kwargs)
142
+
143
+
144
+ def selective_checkpoint_context_fn(policy_fn=None):
145
+ """An activation checkpoint context_fn for selectively deciding what to
146
+ store and what to recompute. Accepts a custom policy.
147
+ Args:
148
+ policy_fn(Union[List[Op], callable]): policy for deciding what to
149
+ store (instead of recompute). If it's a function, it should
150
+ be of form (func, *args, **kwargs) -> bool which indicates
151
+ if func outputs with *args and **kwargs should be stored or not.
152
+ Additionally, a list[Op] is also supported for easier cases.
153
+ The op should be in the format `torch.ops.***`, where the `***`
154
+ names of operators can be obtained with `list_operators`.
155
+ """
156
+ if policy_fn is None:
157
+ policy_fn = _get_default_policy()
158
+ elif isinstance(policy_fn, list):
159
+ policy_fn = _get_default_policy(policy_fn)
160
+ else:
161
+ assert callable(policy_fn), "policy_fn should be None, list or a callable"
162
+
163
+ temp_storage: Dict[Any, List[Any]] = defaultdict(list)
164
+ # assumption: grad_mode doesn't change inside function
165
+ caching_mode: ContextManager[None]
166
+ if torch.is_grad_enabled():
167
+ caching_mode = _CachingTorchDispatchMode(deepcopy(policy_fn), temp_storage)
168
+ else:
169
+ caching_mode = NullTorchDispatchMode()
170
+ cached_mode = CachedTorchDispatchMode(deepcopy(policy_fn), temp_storage, True)
171
+
172
+ return caching_mode, cached_mode
173
+
174
+
175
+ def checkpoint(
176
+ function, *args, preserve_rng_state=True, policy_fn=None, **kwargs
177
+ ) -> Any:
178
+ """Wrapper around torch.utils.checkpoint that accepts a custom policy
179
+ function for selectively deciding what to store and what to recompute
180
+ Args:
181
+ function: describes what to run in the forward pass of the model or
182
+ part of the model. It should also know how to handle the inputs
183
+ passed as the tuple. For example, in LSTM, if user passes
184
+ ``(activation, hidden)``, :attr:`function` should correctly use the
185
+ first input as ``activation`` and the second input as ``hidden``
186
+ preserve_rng_state(bool, optional): Omit stashing and restoring
187
+ the RNG state during each checkpoint.
188
+ Default: ``True``
189
+ policy_fn(Union[List[Op], callable]): policy for deciding what to
190
+ store (instead of recompute). If it's a function, it should
191
+ be of form (func, *args, **kwargs) -> bool which indicates
192
+ if func outputs with *args and **kwargs should be stored or not.
193
+ Additionally, a list[Op] is also supported for easier cases.
194
+ The op should be in the format `torch.ops.***`, where the `***`
195
+ names of operators can be obtained with `list_operators`.
196
+ *args: Arguments to pass in to the given ``function``.
197
+ **kwargs: Keyword arguments to pass into the given ``function``.
198
+ """
199
+ return torch.utils.checkpoint.checkpoint(
200
+ function,
201
+ *args,
202
+ use_reentrant=False,
203
+ preserve_rng_state=preserve_rng_state,
204
+ context_fn=functools.partial(selective_checkpoint_context_fn, policy_fn),
205
+ **kwargs,
206
+ )
207
+
208
+
209
+ class ProfileOperatorsTorchDispatchMode(TorchDispatchMode):
210
+ def __init__(self, num_runs: int = 10) -> None:
211
+ self.data: List[ProfileMetadata] = []
212
+ self.num_runs: int = num_runs
213
+
214
+ def _get_inplace_metadata(self, func, out) -> Tuple[int, int, Tuple[int, ...]]:
215
+ curr_idx = len(self.data)
216
+
217
+ def get_tensor_id(e):
218
+ return (
219
+ e.untyped_storage().data_ptr() if isinstance(e, torch.Tensor) else None
220
+ )
221
+
222
+ output_ids = tree_map(get_tensor_id, out)
223
+ if not is_inplace(func):
224
+ return curr_idx, output_ids, ()
225
+
226
+ op_id = curr_idx
227
+ op_parent_id = -1
228
+ for i, d in enumerate(self.data):
229
+ # find the first occurence of a tensor that
230
+ # shares the same storage as the current tensor
231
+ past_output_ids = d.output_ids
232
+ past_output_ids = (
233
+ [past_output_ids]
234
+ if not isinstance(past_output_ids, (list, tuple, dict))
235
+ else past_output_ids
236
+ )
237
+ if output_ids in past_output_ids:
238
+ op_parent_id = i
239
+ break
240
+ if op_parent_id < 0:
241
+ op_parent_id = op_id
242
+ inplace_info = (op_id, op_parent_id)
243
+ return curr_idx, output_ids, inplace_info
244
+
245
+ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
246
+ if kwargs is None:
247
+ kwargs = {}
248
+ out = func(*args, **kwargs)
249
+
250
+ curr_idx, output_ids, inplace_info = self._get_inplace_metadata(func, out)
251
+ is_view_like = is_view_fn(func) or is_inplace_view_fn(func)
252
+ is_rand_op = torch.Tag.nondeterministic_seeded in func.tags
253
+ # sdpa has non-deterministic seed, but might be deterministic
254
+ # if no dropout is applied
255
+ if func.overloadpacket.__name__ == "_scaled_dot_product_flash_attention":
256
+ is_rand_op = kwargs.get("dropout_p", 0) != 0
257
+
258
+ # get runtime info of func
259
+ torch.cuda.synchronize()
260
+ t = time.time()
261
+ for i in range(self.num_runs):
262
+ func(*args, **kwargs)
263
+ torch.cuda.synchronize()
264
+ time_taken = (time.time() - t) / self.num_runs
265
+
266
+ # get memory usage of func
267
+ torch.cuda.reset_peak_memory_stats()
268
+ mem1 = torch.cuda.max_memory_allocated() / 2**20
269
+ func(*args, **kwargs)
270
+ mem2 = torch.cuda.max_memory_allocated() / 2**20
271
+
272
+ self.data.append(
273
+ ProfileMetadata(
274
+ func,
275
+ time_taken,
276
+ mem2 - mem1,
277
+ curr_idx,
278
+ output_ids,
279
+ inplace_info,
280
+ is_view_like,
281
+ is_rand_op,
282
+ )
283
+ )
284
+ return out
285
+
286
+
287
+ def _analyze_operators(function, *args) -> List[ProfileMetadata]:
288
+ """
289
+ Use ProfileOperatorsTorchDispatchMode to get runtime and memory info.
290
+
291
+ Args:
292
+ function: The function to optimize which will be selectively checkpointed. Usually the forward pass
293
+ of the model.
294
+ *args: Arguments to pass in to the given ``function``.
295
+
296
+ Returns:
297
+ A list of tuples, where each tuples contains the name of the operator, the runtime of the operator,
298
+ and the memory usage of the operator.
299
+
300
+ """
301
+ profile_ops = ProfileOperatorsTorchDispatchMode()
302
+ with profile_ops:
303
+ function(*args)
304
+
305
+ data = profile_ops.data
306
+ return data
307
+
308
+
309
+ def get_optimal_checkpoint_policy(function, *args, memory_budget: float) -> Callable:
310
+ """
311
+ Given a function, its arguments, and the maximum amount of memory available,
312
+ find the subset of operators that can be optimized to reduce runtime while still fitting within the memory budget.
313
+
314
+ Args:
315
+ function: The function to optimize which will be selectively checkpointed. Usually the forward pass
316
+ of the model.
317
+ *args: Arguments to pass in to the given ``function``.
318
+ memory_budget (float): A float between 0 and 1 which describes what percentage of the total memory to use.
319
+
320
+ Returns:
321
+ A callable policy which can be passed to xformers.checkpoint()
322
+
323
+ Raises:
324
+ RuntimeError: If `scipy` is not available.
325
+ ValueError: If `memory_budget` is not a float between 0 and 1.
326
+
327
+ """
328
+ if not _scipy_is_available:
329
+ raise RuntimeError(
330
+ "Please install scipy 1.9.0+ to use `get_optimal_checkpoint_policy`. You can do so using "
331
+ "`pip install scipy`."
332
+ )
333
+ if memory_budget < 0 or memory_budget > 1:
334
+ raise ValueError(
335
+ f"`memory_budget` must be a float between 0 and 1. Got {memory_budget}."
336
+ )
337
+
338
+ data = _analyze_operators(function, *args)
339
+ # remove aten.detach.default from the list of ops because autograd
340
+ # inserts those during backward and it breaks the fwd-bwd alignment
341
+ data = [x for x in data if x.name not in OPS_TO_ALWAYS_SKIP]
342
+
343
+ ops, runtimes_, memory_, new_ids, _, inplace_ops_, view_like_ops_, rand_ops_ = zip(
344
+ *[astuple(x) for x in data]
345
+ )
346
+ runtimes = torch.tensor(runtimes_, dtype=torch.float64)
347
+ memory = torch.tensor(memory_, dtype=torch.float64)
348
+ view_like_ops = [i for i, x in enumerate(view_like_ops_) if x]
349
+ rand_ops = [i for i, x in enumerate(rand_ops_) if x]
350
+
351
+ # remap the inplace indices as we have removed OPS_TO_ALWAYS_SKIP
352
+ inplace_ops = [tuple(map(new_ids.index, x)) for x in inplace_ops_ if x]
353
+
354
+ # the last operation is always stored as the output of the checkpoint
355
+ # block, so we can avoid recomputing it. We set the memory to zero
356
+ # instead of adding a new constraint because we want both the 0 and 1
357
+ # endpoints for memory_budget to be valid
358
+ # FIXME: this heuristic for finding the last non-view non-inplace op
359
+ # might not always be correct, which would yield suboptimal policies
360
+ last_op = len(ops) - 1
361
+ skip_ops_ = set(view_like_ops) | set([x[0] for x in inplace_ops])
362
+ skip_ops = sorted(list(skip_ops_))
363
+ for op in reversed(skip_ops):
364
+ if op == last_op:
365
+ last_op -= 1
366
+
367
+ memory[last_op] = 0
368
+
369
+ max_memory = memory_budget * memory.sum().item()
370
+
371
+ # workaround to fix https://github.com/pytorch/pytorch/issues/121212
372
+ force_store_random = all([not isinstance(x, torch.Tensor) for x in args])
373
+
374
+ optim_output = _optimize_runtime_with_given_memory(
375
+ memory=memory,
376
+ runtimes=runtimes,
377
+ max_memory=max_memory,
378
+ view_like_ops=view_like_ops,
379
+ inplace_ops=inplace_ops,
380
+ random_ops=rand_ops,
381
+ force_store_random=force_store_random,
382
+ )
383
+ return _OptimalPolicy(optim_output=optim_output)
384
+
385
+
386
+ def _optimize_runtime_with_given_memory(
387
+ memory: torch.Tensor,
388
+ runtimes: torch.Tensor,
389
+ max_memory: float,
390
+ view_like_ops: List[int],
391
+ inplace_ops: List[Tuple[int, ...]],
392
+ random_ops: List[int],
393
+ force_store_random: bool,
394
+ ) -> torch.Tensor:
395
+ """
396
+ Given a list of operator names, their corresponding runtimes, and the maximum amount of memory available,
397
+ find the subset of operators that can be optimized to reduce runtime while still fitting within the memory budget.
398
+ Uses https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.milp.html
399
+
400
+ Args:
401
+ memory (torch.Tensor): Tensor containing the memory usage of each operator.
402
+ runtimes (torch.Tensor): Tensor containing the runtime of each operator.
403
+ max_memory (float): Maximum amount of memory to use.
404
+ view_like_ops ([List[int]): Indices of the view-like ops.
405
+ inplace_ops (List[Tuple[int, int]]): Tuple with the pair of inplace op -> parent of inplace op.
406
+ This will be used to add the constraint that in-place ops need to either be
407
+ stored in memory with the previous op, or recomputed with the previous op.
408
+ random_ops ([List[int]): Indices of the random ops, which will always be recomputed.
409
+ force_store_random (bool): force random ops to always be stored (instead of recomputed)
410
+ """
411
+ c = -runtimes # type: ignore[operator]
412
+
413
+ memory_constraint = LinearConstraint(A=memory, ub=max_memory)
414
+ constraints = [memory_constraint]
415
+
416
+ # view-like ops should always be recomputed
417
+ for i in view_like_ops:
418
+ A = torch.zeros_like(c)
419
+ A[i] = 1
420
+ constraints.append(LinearConstraint(A=A, lb=0, ub=0))
421
+
422
+ # inplace ops should always be done in conjunction with its parent op
423
+ # i.e., if we recompute the parent op the inplace should also be
424
+ # recomputed, and vice versa
425
+ for op, op_parent in inplace_ops:
426
+ A = torch.zeros_like(c)
427
+ if op != op_parent:
428
+ A[op_parent] = 1
429
+ A[op] = -1
430
+ constraints.append(LinearConstraint(A=A, lb=0, ub=0))
431
+ else:
432
+ # if op == op_parent, it's because it's the first op
433
+ # that is inplace. Thus never recompute it
434
+ A[op] = 1
435
+ constraints.append(LinearConstraint(A=A, lb=1, ub=1))
436
+
437
+ # ideally, always recompute random ops
438
+ # in practice, due to a bug in https://github.com/pytorch/pytorch/issues/121212
439
+ # sometimes we need to store them to avoid correctness issues
440
+ for i in random_ops:
441
+ A = torch.zeros_like(c)
442
+ A[i] = 1
443
+ val = int(force_store_random)
444
+ constraints.append(LinearConstraint(A=A, lb=val, ub=val))
445
+
446
+ integrality = torch.ones_like(c)
447
+ res = milp(
448
+ c=c, constraints=constraints, integrality=integrality, bounds=Bounds(0, 1)
449
+ )
450
+ if not res.success:
451
+ raise ValueError(
452
+ "The problem is infeasible, and probably due to a change in xformers "
453
+ "that makes random ops always be stored. Try passing a larger memory_budget. "
454
+ "This will be fixed once https://github.com/pytorch/pytorch/issues/121212 "
455
+ "is solved"
456
+ )
457
+ x = torch.from_numpy(res.x)
458
+ return x
459
+
460
+
461
+ class _OptimalPolicy:
462
+ def __init__(self, optim_output: torch.Tensor):
463
+ self.counter = 0
464
+ self.optim_output = optim_output.tolist()
465
+
466
+ def __call__(self, ctx, func, *args, **kwargs) -> bool:
467
+ # returning False means recompute, True means store in memory
468
+ if func in OPS_TO_ALWAYS_SKIP:
469
+ return False
470
+ count = self.counter
471
+ self.counter += 1
472
+ return self.optim_output[count] == 1
473
+
474
+
475
+ class SelectiveCheckpointWrapper(ActivationWrapper):
476
+ def __init__(self, mod, memory_budget=None, policy_fn=None):
477
+ super().__init__(mod)
478
+ if not ((memory_budget is None) ^ (policy_fn is None)):
479
+ raise ValueError("Need to specify either policy_fn or memory_budget")
480
+ self.memory_budget = memory_budget
481
+ self.policy_fn = policy_fn
482
+
483
+ try:
484
+ # for backward-compatibility as this doesn't exist in PT anymore
485
+ torch._dynamo.config._experimental_support_context_fn_in_torch_utils_checkpoint = (
486
+ True
487
+ )
488
+ except AttributeError:
489
+ pass
490
+
491
+ @torch.compiler.disable
492
+ def _get_policy_fn(self, *args, **kwargs):
493
+ if not torch.is_grad_enabled():
494
+ # no need to compute a policy as it won't be used
495
+ return []
496
+ # if policy is not specified, initialize policy for a given memory budget
497
+ with torch.random.fork_rng():
498
+ policy_fn = get_optimal_checkpoint_policy(
499
+ self._checkpoint_wrapped_module,
500
+ *args,
501
+ **kwargs,
502
+ memory_budget=self.memory_budget,
503
+ )
504
+ if (
505
+ torch.distributed.is_available()
506
+ and torch.distributed.is_initialized()
507
+ and torch.distributed.get_world_size() > 1
508
+ ):
509
+ # use the same policy across different GPUs
510
+ objects = [policy_fn]
511
+ torch.distributed.broadcast_object_list(objects, src=0)
512
+ policy_fn = objects[0]
513
+ return policy_fn
514
+
515
+ def get_policy_fn(self, *args, **kwargs):
516
+ if self.policy_fn is None:
517
+ self.policy_fn = self._get_policy_fn(*args, **kwargs)
518
+ return self.policy_fn
519
+
520
+ def forward(self, *args, **kwargs):
521
+ policy_fn = self.get_policy_fn(*args, **kwargs)
522
+ return checkpoint(
523
+ self._checkpoint_wrapped_module, *args, **kwargs, policy_fn=policy_fn
524
+ )
525
+
526
+
527
+ def selective_checkpoint_wrapper(
528
+ module: torch.nn.Module,
529
+ memory_budget: Optional[float] = None,
530
+ policy_fn: Optional[Callable] = None,
531
+ ):
532
+ """
533
+ Wrap a module with selective activation checkpointing.
534
+
535
+ It behaves similarly to PyTorch's checkpoint_wrapper, but gives the possibility
536
+ to the user to either specify a handcrafted policy_fn, or to let an optimization
537
+ algorithm to select the policy given a user-specified memory_budget.
538
+
539
+ The user should either specify the memory_budget argument or the policy_fn.
540
+
541
+ The memory_budget is a float value between 0 (recompute everything in the backward) or 1
542
+ (store everything for backward). Using a value of 0 should be similar to PyTorch's
543
+ activation checkpoint, while 1 should be similar to the behavior of not using any
544
+ activation checkpointing.
545
+ """
546
+ return SelectiveCheckpointWrapper(module, memory_budget, policy_fn)
.venv/lib/python3.11/site-packages/xformers/cpp_lib.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version": {"cuda": 1201, "hip": null, "torch": "2.5.1+cu121", "python": "3.11.10", "flash": "v2.6.3-24-gbdf733b", "use_torch_flash": true}, "env": {"TORCH_CUDA_ARCH_LIST": "6.0+PTX 7.0 7.5 8.0+PTX 9.0a", "PYTORCH_ROCM_ARCH": null, "XFORMERS_BUILD_TYPE": "Release", "XFORMERS_ENABLE_DEBUG_ASSERTIONS": null, "NVCC_FLAGS": "-allow-unsupported-compiler", "XFORMERS_PACKAGE_FROM": "wheel-v0.0.28.post3"}}
.venv/lib/python3.11/site-packages/xformers/factory/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from xformers.components import MultiHeadDispatchConfig # noqa
2
+ from xformers.components.attention import AttentionConfig # noqa
3
+ from xformers.components.feedforward import FeedforwardConfig # noqa
4
+ from xformers.components.positional_embedding import PositionEmbeddingConfig # noqa
5
+
6
+ from .block_factory import xFormerDecoderBlock # noqa
7
+ from .block_factory import xFormerDecoderConfig # noqa
8
+ from .block_factory import xFormerEncoderBlock # noqa
9
+ from .block_factory import xFormerEncoderConfig # noqa
10
+ from .model_factory import xFormer, xFormerConfig # noqa
11
+ from .weight_init import xFormerWeightInit # noqa
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (962 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/block_configs.cpython-311.pyc ADDED
Binary file (9.6 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/block_factory.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/hydra_helper.cpython-311.pyc ADDED
Binary file (1.87 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/model_factory.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/__pycache__/weight_init.cpython-311.pyc ADDED
Binary file (13 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/factory/block_configs.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ from typing import Any, Dict, Optional
10
+
11
+ from xformers.components import NormalizationType, ResidualNormStyle
12
+ from xformers.components.feedforward import FEEDFORWARD_REGISTRY, FeedforwardConfig
13
+ from xformers.components.positional_embedding import (
14
+ POSITION_EMBEDDING_REGISTRY,
15
+ PositionEmbeddingConfig,
16
+ )
17
+ from xformers.utils import generate_matching_config
18
+
19
+
20
+ class LayerPositionBitmask(int, Enum):
21
+ First = 0b01
22
+ Last = 0b10
23
+ Default = 0b11
24
+
25
+
26
+ class LayerPosition:
27
+ """Bitmask to mark this layer as first, last, nothing or both"""
28
+
29
+ def __init__(self):
30
+ self.bitmask = LayerPositionBitmask.Default
31
+
32
+ def is_first(self):
33
+ return bool(self.bitmask & LayerPositionBitmask.First)
34
+
35
+ def is_last(self):
36
+ return bool(self.bitmask & LayerPositionBitmask.Last)
37
+
38
+ def mark_not_first(self):
39
+ self.bitmask &= ~LayerPositionBitmask.First
40
+
41
+ def mark_not_last(self):
42
+ self.bitmask &= ~LayerPositionBitmask.Last
43
+
44
+
45
+ class BlockType(str, Enum):
46
+ Encoder = "encoder"
47
+ Decoder = "decoder"
48
+
49
+
50
+ @dataclass(init=False) # handle constructors explicitly to force type changes
51
+ class xFormerBlockConfig:
52
+ """
53
+ The configuration structure to define a Transformer block.
54
+ This base class is applicable to both encoder and decoder definitions.
55
+
56
+ This completely defines each of the blocks, for instance in terms of dimensions,
57
+ position encoding, pre or post layer norms or reversibility.
58
+ """
59
+
60
+ dim_model: int
61
+ feedforward_config: FeedforwardConfig
62
+ position_encoding_config: Optional[PositionEmbeddingConfig]
63
+ block_type: BlockType
64
+ residual_norm_style: ResidualNormStyle
65
+ normalization: NormalizationType
66
+ layer_position: LayerPosition
67
+ use_triton: bool
68
+ reversible: bool
69
+ num_layers: int
70
+
71
+ def __init__(
72
+ self,
73
+ dim_model: int,
74
+ feedforward_config: Dict[str, Any],
75
+ position_encoding_config: Optional[Dict[str, Any]],
76
+ block_type: BlockType,
77
+ residual_norm_style: ResidualNormStyle = ResidualNormStyle("post"),
78
+ normalization: NormalizationType = NormalizationType.LayerNorm,
79
+ reversible: bool = False,
80
+ num_layers: int = 1,
81
+ layer_position: Optional[LayerPosition] = None,
82
+ ):
83
+
84
+ self.dim_model = dim_model
85
+ self.block_type = block_type
86
+ self.residual_norm_style = residual_norm_style
87
+ self.reversible = reversible
88
+ self.num_layers = num_layers
89
+ self.normalization = normalization
90
+
91
+ # Fill in possible gaps in the config for subparts of the block
92
+ self.feedforward_config = generate_matching_config(
93
+ feedforward_config,
94
+ FEEDFORWARD_REGISTRY[feedforward_config["name"]].config,
95
+ )
96
+
97
+ self.position_encoding_config = (
98
+ generate_matching_config(
99
+ position_encoding_config,
100
+ POSITION_EMBEDDING_REGISTRY[position_encoding_config["name"]].config,
101
+ )
102
+ if position_encoding_config is not None
103
+ else None
104
+ )
105
+
106
+ # Default is that this layer is the only one, so both first and last
107
+ if layer_position:
108
+ self.layer_position = layer_position
109
+ else:
110
+ self.layer_position = LayerPosition()
111
+
112
+
113
+ @dataclass(init=False)
114
+ class xFormerEncoderConfig(xFormerBlockConfig):
115
+ """
116
+ The configuration structure for an encoder block
117
+ """
118
+
119
+ multi_head_config: Dict[str, Any]
120
+ use_triton: bool
121
+ simplicial_embeddings: Optional[Dict[str, Any]]
122
+ patch_embedding_config: Optional[Dict[str, Any]]
123
+
124
+ def __init__(
125
+ self,
126
+ dim_model: int,
127
+ feedforward_config: Dict[str, Any],
128
+ multi_head_config: Dict[str, Any],
129
+ position_encoding_config: Optional[Dict[str, Any]] = None,
130
+ residual_norm_style: str = "post",
131
+ normalization: NormalizationType = NormalizationType.LayerNorm,
132
+ use_triton: bool = True,
133
+ simplicial_embeddings: Optional[Dict[str, Any]] = None,
134
+ patch_embedding_config: Optional[Dict[str, Any]] = None,
135
+ **kwargs,
136
+ ):
137
+ # Convenience, fill in duplicated fields
138
+ try:
139
+ if "dim_model" not in multi_head_config.keys():
140
+ multi_head_config["dim_model"] = dim_model
141
+
142
+ if "dim_model" not in feedforward_config.keys():
143
+ feedforward_config["dim_model"] = dim_model
144
+
145
+ if (
146
+ position_encoding_config is not None
147
+ and "dim_model" not in position_encoding_config.keys()
148
+ ):
149
+ position_encoding_config["dim_model"] = dim_model
150
+
151
+ if (
152
+ patch_embedding_config is not None
153
+ and "out_channels" not in patch_embedding_config.keys()
154
+ ):
155
+ patch_embedding_config["out_channels"] = dim_model
156
+
157
+ except AttributeError:
158
+ # A config instance was passed in, this is fine
159
+ pass
160
+ if "block_type" in kwargs:
161
+ assert kwargs["block_type"] == "encoder"
162
+ kwargs["block_type"] = BlockType("encoder")
163
+ super().__init__(
164
+ dim_model=dim_model,
165
+ feedforward_config=feedforward_config,
166
+ position_encoding_config=position_encoding_config,
167
+ residual_norm_style=ResidualNormStyle(residual_norm_style),
168
+ normalization=NormalizationType(normalization),
169
+ **kwargs,
170
+ )
171
+
172
+ self.multi_head_config = multi_head_config
173
+ self.use_triton = use_triton
174
+ self.simplicial_embeddings = simplicial_embeddings
175
+ self.patch_embedding_config = patch_embedding_config
176
+
177
+
178
+ @dataclass(init=False)
179
+ class xFormerDecoderConfig(xFormerBlockConfig):
180
+ """
181
+ The configuration structure for a decoder block.
182
+
183
+ This specifically defines the masked and cross attention mechanisms,
184
+ on top of the settings defining all blocks.
185
+ """
186
+
187
+ multi_head_config_masked: Dict[str, Any] # prior to encoder output
188
+ multi_head_config_cross: Dict[str, Any] # cross attention, takes encoder output
189
+
190
+ def __init__(
191
+ self,
192
+ dim_model: int,
193
+ feedforward_config: Dict[str, Any],
194
+ multi_head_config_masked: Dict[str, Any],
195
+ multi_head_config_cross: Dict[str, Any],
196
+ position_encoding_config: Optional[Dict[str, Any]] = None,
197
+ residual_norm_style: str = "post",
198
+ normalization: NormalizationType = NormalizationType.LayerNorm,
199
+ use_triton: bool = True,
200
+ **kwargs,
201
+ ):
202
+
203
+ # Convenience, fill in duplicated field
204
+ try:
205
+ if "dim_model" not in multi_head_config_masked.keys():
206
+ multi_head_config_masked["dim_model"] = dim_model
207
+
208
+ if "dim_model" not in multi_head_config_cross.keys():
209
+ multi_head_config_cross["dim_model"] = dim_model
210
+
211
+ if "dim_model" not in feedforward_config.keys():
212
+ feedforward_config["dim_model"] = dim_model
213
+
214
+ if (
215
+ position_encoding_config is not None
216
+ and "dim_model" not in position_encoding_config.keys()
217
+ ):
218
+ position_encoding_config["dim_model"] = dim_model
219
+ except AttributeError:
220
+ # A config instance was passed in, this is fine
221
+ pass
222
+ if "block_type" in kwargs.keys():
223
+ assert kwargs["block_type"] == "decoder"
224
+ kwargs["block_type"] = BlockType("decoder")
225
+
226
+ super().__init__(
227
+ dim_model=dim_model,
228
+ feedforward_config=feedforward_config,
229
+ position_encoding_config=position_encoding_config,
230
+ residual_norm_style=ResidualNormStyle(residual_norm_style),
231
+ normalization=NormalizationType(normalization),
232
+ **kwargs,
233
+ )
234
+
235
+ self.multi_head_config_masked = multi_head_config_masked
236
+ self.multi_head_config_cross = multi_head_config_cross
237
+ self.use_triton = use_triton
.venv/lib/python3.11/site-packages/xformers/factory/block_factory.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import logging
8
+ from dataclasses import asdict
9
+ from typing import Optional, Tuple, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from xformers._deprecation_warning import deprecated_function
15
+ from xformers.components import (
16
+ PatchEmbeddingConfig,
17
+ PostNorm,
18
+ PreNorm,
19
+ Residual,
20
+ ResidualNormStyle,
21
+ build_multi_head_attention,
22
+ build_patch_embedding,
23
+ )
24
+ from xformers.components.attention import AttentionMask
25
+ from xformers.components.feedforward import build_feedforward
26
+ from xformers.components.positional_embedding import build_positional_embedding
27
+ from xformers.components.residual import get_deepnorm_coefficients
28
+ from xformers.components.simplicial_embedding import SimplicialEmbedding
29
+ from xformers.factory.block_configs import (
30
+ NormalizationType,
31
+ xFormerDecoderConfig,
32
+ xFormerEncoderConfig,
33
+ )
34
+
35
+ logger = logging.getLogger("xformers")
36
+
37
+
38
+ def _get_ln_factory(
39
+ d_model: int,
40
+ residual_norm_style: Optional[ResidualNormStyle],
41
+ use_triton: bool,
42
+ residual: bool,
43
+ normalization: NormalizationType = NormalizationType.LayerNorm,
44
+ residual_scale: float = 1.0,
45
+ ):
46
+ """
47
+ Handle all the supported residual path configurations.
48
+
49
+ ..Note: we return the appropriate constructor, not an actual layer
50
+ """
51
+
52
+ def get_layer_wrapper(
53
+ d_model: int,
54
+ sublayer: nn.Module,
55
+ residual_norm_style: Optional[ResidualNormStyle],
56
+ residual: bool,
57
+ residual_scale: float,
58
+ ):
59
+ if residual:
60
+ if residual_norm_style == ResidualNormStyle.Pre:
61
+ return Residual(
62
+ layer=PreNorm(d_model, sublayer, normalization, use_triton),
63
+ scale=None,
64
+ )
65
+ elif residual_norm_style == ResidualNormStyle.Post:
66
+ return PostNorm(
67
+ d_model,
68
+ Residual(layer=sublayer, scale=None),
69
+ normalization,
70
+ use_triton,
71
+ )
72
+ elif residual_norm_style == ResidualNormStyle.DeepNorm:
73
+ return PostNorm(
74
+ d_model,
75
+ Residual(layer=sublayer, scale=residual_scale),
76
+ normalization,
77
+ use_triton=use_triton,
78
+ )
79
+ else:
80
+ raise ValueError
81
+
82
+ return (
83
+ PreNorm(d_model, sublayer, normalization, use_triton)
84
+ if residual_norm_style == ResidualNormStyle.Pre
85
+ else PostNorm(d_model, sublayer, normalization, use_triton)
86
+ )
87
+
88
+ def ln_factory(sublayer: nn.Module):
89
+ return get_layer_wrapper(
90
+ d_model, sublayer, residual_norm_style, residual, residual_scale
91
+ )
92
+
93
+ return ln_factory
94
+
95
+
96
+ class xFormerEncoderBlock(torch.nn.Module):
97
+ r"""A vanilla Transformer Encoder block"""
98
+
99
+ def __init__(self, config: xFormerEncoderConfig, **kwargs):
100
+ super().__init__()
101
+ deprecated_function(self)
102
+
103
+ self.reversible_f = None
104
+ self.reversible_g = None
105
+ self.residual_norm_style = config.residual_norm_style
106
+ self.dim_model = config.dim_model
107
+
108
+ # If this layer is the first one, and a pose encoding has been requested
109
+ if (
110
+ config.position_encoding_config is not None
111
+ and config.layer_position.is_first()
112
+ ):
113
+ self.pose_encoding = build_positional_embedding(
114
+ asdict(config.position_encoding_config)
115
+ )
116
+
117
+ pos_encoding_dim = config.position_encoding_config.dim_model
118
+ mha_dim = config.multi_head_config["dim_model"]
119
+
120
+ if pos_encoding_dim != mha_dim:
121
+ logger.warning(
122
+ f"The embedding dim and model dim do not match ({pos_encoding_dim} vs {mha_dim}), adding a projector layer." # noqa
123
+ )
124
+ self.embedding_projector = nn.Linear(pos_encoding_dim, mha_dim)
125
+ else:
126
+ self.pose_encoding = None
127
+
128
+ if config.residual_norm_style == ResidualNormStyle.DeepNorm:
129
+ # Just use the layer norm coefficient here,
130
+ # the init will be handled at the xformers level (knows about encoder and decoder blocks)
131
+ deep_norm_coefficients, _ = get_deepnorm_coefficients(
132
+ encoder_layers=config.num_layers, decoder_layers=0
133
+ )
134
+ assert deep_norm_coefficients is not None
135
+ residual_scale = deep_norm_coefficients.alpha
136
+ else:
137
+ residual_scale = 1.0
138
+
139
+ # mini helper, builds a normalization layer with the right Pre/Post config, residuals, and the right dimensions
140
+ ln_factory = _get_ln_factory(
141
+ config.dim_model,
142
+ config.residual_norm_style,
143
+ use_triton=config.use_triton,
144
+ residual=True,
145
+ residual_scale=residual_scale,
146
+ normalization=config.normalization,
147
+ )
148
+
149
+ mha = build_multi_head_attention(config.multi_head_config)
150
+ feedforward = build_feedforward(asdict(config.feedforward_config))
151
+
152
+ # Expose attention specific capabilities
153
+ self.supports_attention_mask = mha.attention.supports_attention_mask
154
+ self.requires_same_k_q_dimensions = mha.attention.requires_same_k_q_dimensions
155
+ self.causal = (
156
+ mha.attention.causal if hasattr(mha.attention, "causal") else False
157
+ )
158
+
159
+ # Wrappers handle the different layer norm styles (pre- and post-) and the residual path
160
+ self.wrap_att = ln_factory(mha)
161
+ self.wrap_ff: Union[Residual, PostNorm] = ln_factory(feedforward)
162
+ if (
163
+ config.residual_norm_style == ResidualNormStyle.Pre
164
+ and config.layer_position.is_last()
165
+ ):
166
+ self.wrap_ff = PostNorm(
167
+ config.dim_model,
168
+ self.wrap_ff,
169
+ normalization=config.normalization,
170
+ use_triton=config.use_triton,
171
+ )
172
+
173
+ # Simplicial embeddings are only used if specified, and on the last layer
174
+ self.simplicial_embedding: Optional[SimplicialEmbedding] = None
175
+ if config.simplicial_embeddings is not None and config.layer_position.is_last():
176
+ self.simplicial_embedding = SimplicialEmbedding(
177
+ **config.simplicial_embeddings
178
+ )
179
+
180
+ # Optional patch embedding
181
+ self.patch_emb: Optional[nn.Module] = None
182
+
183
+ if config.patch_embedding_config is not None:
184
+ self.patch_emb = build_patch_embedding(
185
+ PatchEmbeddingConfig(**config.patch_embedding_config)
186
+ )
187
+
188
+ @classmethod
189
+ def from_config(cls, config: xFormerEncoderConfig):
190
+ return cls(config)
191
+
192
+ @staticmethod
193
+ def get_reversible_layer(config) -> Tuple[nn.Module, nn.Module]:
194
+ ln_factory = _get_ln_factory(
195
+ config.dim_model,
196
+ config.residual_norm_style,
197
+ residual=False,
198
+ use_triton=config.use_triton,
199
+ normalization=config.normalization,
200
+ )
201
+
202
+ mha = build_multi_head_attention(config.multi_head_config)
203
+ feedforward = build_feedforward(asdict(config.feedforward_config))
204
+
205
+ reversible_f = ln_factory(mha)
206
+ reversible_g = ln_factory(feedforward)
207
+ return reversible_f, reversible_g
208
+
209
+ def forward(
210
+ self,
211
+ x: torch.Tensor,
212
+ att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
213
+ input_mask: Optional[torch.Tensor] = None,
214
+ ):
215
+ if self.patch_emb is not None:
216
+ x = self.patch_emb(x)
217
+
218
+ if self.pose_encoding is not None:
219
+ x = self.pose_encoding(x)
220
+
221
+ if hasattr(self, "embedding_projector"):
222
+ x = self.embedding_projector(x)
223
+
224
+ # Handle the optional input masking, differs on Q, K, V
225
+ if input_mask is not None:
226
+ q = x
227
+ k = x * input_mask.unsqueeze(-1)
228
+ v = k
229
+ else:
230
+ q, k, v = x, x, x
231
+
232
+ # Pre/Post norms and residual paths are already handled
233
+ x = self.wrap_att(inputs=[q, k, v], att_mask=att_mask)
234
+ x = self.wrap_ff(inputs=[x])
235
+
236
+ # Optional simplicial embeddings
237
+ if self.simplicial_embedding is not None:
238
+ x = self.simplicial_embedding(x)
239
+
240
+ return x
241
+
242
+
243
+ class xFormerDecoderBlock(torch.nn.Module):
244
+ r"""A vanilla Transformer Decoder block
245
+
246
+ ... note: this implementation is not (yet ?) reversible"""
247
+
248
+ def __init__(self, config: xFormerDecoderConfig, **kwargs):
249
+ super().__init__()
250
+ deprecated_function(self)
251
+
252
+ # If this layer is the first one, and a pose encoding as been requested
253
+ if (
254
+ config.position_encoding_config is not None
255
+ and config.layer_position.is_first()
256
+ ):
257
+ self.pose_encoding = build_positional_embedding(
258
+ config.position_encoding_config
259
+ )
260
+
261
+ pos_encoding_dim = config.position_encoding_config.dim_model
262
+ mha_dim = config.multi_head_config_masked["dim_model"]
263
+
264
+ if pos_encoding_dim != mha_dim:
265
+
266
+ logger.warning(
267
+ f"The embedding dim and model dim do not match ({pos_encoding_dim} vs {mha_dim}), adding a projector layer." # noqa
268
+ )
269
+
270
+ self.embedding_projector = nn.Linear(pos_encoding_dim, mha_dim)
271
+ else:
272
+ self.pose_encoding = None
273
+
274
+ if config.residual_norm_style == ResidualNormStyle.DeepNorm:
275
+ # Just use the layer norm coefficient here,
276
+ # the init will be handled at the xformers level (knows about encoder and decoder blocks)
277
+ _, deep_norm_coefficients = get_deepnorm_coefficients(
278
+ encoder_layers=0, decoder_layers=config.num_layers
279
+ )
280
+ assert deep_norm_coefficients is not None
281
+ residual_scale = deep_norm_coefficients.alpha
282
+ else:
283
+ residual_scale = 1.0
284
+
285
+ # mini helper, builds a LayerNorm with the right Pre/Post config and the right dimensions
286
+ ln_factory = _get_ln_factory(
287
+ config.dim_model,
288
+ config.residual_norm_style,
289
+ use_triton=config.use_triton,
290
+ residual=True,
291
+ residual_scale=residual_scale,
292
+ normalization=config.normalization,
293
+ )
294
+
295
+ mha = build_multi_head_attention(config.multi_head_config_masked)
296
+ cross_mha = build_multi_head_attention(config.multi_head_config_cross)
297
+ feedforward = build_feedforward(config.feedforward_config)
298
+
299
+ # Expose attention or feedforward specific capabilities
300
+ self.supports_attention_mask = mha.attention.supports_attention_mask
301
+ self.requires_same_k_q_dimensions = mha.attention.requires_same_k_q_dimensions
302
+ self.requires_squared_context_length = (
303
+ feedforward.requires_squared_context
304
+ or mha.attention.requires_squared_context
305
+ )
306
+
307
+ self.causal_attention = (
308
+ mha.attention.causal if hasattr(mha.attention, "causal") else False
309
+ )
310
+
311
+ # Wrappers handle the different layer norm styles (pre- and post-) and the residual path
312
+ self.wrap_att = ln_factory(mha)
313
+ self.wrap_cross = ln_factory(cross_mha)
314
+ self.wrap_ff: Union[Residual, PostNorm] = ln_factory(feedforward)
315
+
316
+ if (
317
+ config.residual_norm_style == ResidualNormStyle.Pre
318
+ and config.layer_position.is_last()
319
+ ):
320
+ self.wrap_ff = PostNorm(
321
+ config.dim_model,
322
+ self.wrap_ff,
323
+ normalization=NormalizationType.LayerNorm,
324
+ )
325
+
326
+ @classmethod
327
+ def from_config(cls, config: xFormerDecoderConfig):
328
+ return cls(config)
329
+
330
+ def forward(
331
+ self,
332
+ target: torch.Tensor,
333
+ memory: torch.Tensor,
334
+ encoder_att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
335
+ decoder_att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
336
+ input_mask: Optional[torch.Tensor] = None,
337
+ ):
338
+ if self.pose_encoding is not None:
339
+ target = self.pose_encoding(target)
340
+
341
+ if hasattr(self, "embedding_projector"):
342
+ target = self.embedding_projector(target)
343
+
344
+ # Handle the optional input masking, differs on Q, K, V
345
+ if input_mask is not None:
346
+ target_q = target
347
+ target_k = target * input_mask.unsqueeze(-1)
348
+ target_v = target_k
349
+ else:
350
+ target_q, target_k, target_v = target, target, target
351
+
352
+ x = self.wrap_att(
353
+ inputs=[target_q, target_k, target_v], att_mask=decoder_att_mask
354
+ )
355
+ x = self.wrap_cross(inputs=[x, memory, memory], att_mask=encoder_att_mask)
356
+ x = self.wrap_ff(inputs=[x])
357
+
358
+ return x
.venv/lib/python3.11/site-packages/xformers/factory/hydra_helper.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # register components configs into Hydra ConfigStore
7
+ # component config classes could be used to validate configs
8
+ import logging
9
+
10
+ from hydra.core.config_store import ConfigStore
11
+ from omegaconf.errors import ValidationError
12
+
13
+ from xformers.components.attention import ATTENTION_REGISTRY
14
+ from xformers.components.feedforward import FEEDFORWARD_REGISTRY
15
+ from xformers.components.positional_embedding import POSITION_EMBEDDING_REGISTRY
16
+
17
+ logger = logging.getLogger("xformers")
18
+
19
+
20
+ def import_xformer_config_schema():
21
+ """
22
+ Best effort - OmegaConf supports limited typing, so we may fail to import
23
+ certain config classes. For example, pytorch typing are not supported.
24
+ """
25
+ cs = ConfigStore.instance()
26
+
27
+ for k, v in {
28
+ "ff": FEEDFORWARD_REGISTRY,
29
+ "pe": POSITION_EMBEDDING_REGISTRY,
30
+ "attention": ATTENTION_REGISTRY,
31
+ }.items():
32
+ for kk in v.keys():
33
+ try:
34
+ cs.store(name=f"{kk}_schema", node=v[kk].config, group=f"xformers/{k}")
35
+ except ValidationError as e:
36
+ logger.debug(f"Error registering {kk}_schema, error: {e}")
.venv/lib/python3.11/site-packages/xformers/factory/model_factory.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ import torch
12
+
13
+ from xformers._deprecation_warning import deprecated_function
14
+ from xformers.components import reversible as rv
15
+ from xformers.components.residual import ResidualNormStyle, get_deepnorm_coefficients
16
+ from xformers.factory.block_configs import (
17
+ xFormerBlockConfig,
18
+ xFormerDecoderConfig,
19
+ xFormerEncoderConfig,
20
+ )
21
+ from xformers.factory.block_factory import xFormerDecoderBlock, xFormerEncoderBlock
22
+ from xformers.factory.weight_init import get_weight_init_fn, xFormerWeightInit
23
+
24
+ logger = logging.getLogger("xformers")
25
+
26
+
27
+ @dataclass(init=False)
28
+ class xFormerConfig:
29
+ """
30
+ The configuration structure to define a full Transformer.
31
+ This can include a stack of encoder layers, and a stack of decoder layers.
32
+
33
+ It is optionally possible to share the embedding weights in between
34
+ the encoder and decoder positional encoding, as proposed for instance by
35
+ `Using the Output Embedding to Improve Language Models`, Press et al.
36
+
37
+ A full config example is for instance as follows:
38
+
39
+ ::
40
+
41
+ xformer_config = [
42
+ {
43
+ "reversible": False, # Turn on to test the effect of using reversible layers
44
+ "block_type": "encoder",
45
+ "num_layers": LAYERS,
46
+ "dim_model": EMB,
47
+ "residual_norm_style": "pre",
48
+ "position_encoding_config": {
49
+ "name": "vocab",
50
+ "seq_len": CONTEXT,
51
+ "vocab_size": VOCAB_SIZE,
52
+ },
53
+ "multi_head_config": {
54
+ "num_heads": NUM_HEADS,
55
+ "residual_dropout": RES_DROP,
56
+ "use_rotary_embeddings": True,
57
+ "attention": {
58
+ "name": ATTENTION_MECHANISM_STR,
59
+ "dropout": ATTN_DROP,
60
+ "causal": True,
61
+ "seq_len": CONTEXT,
62
+ },
63
+ },
64
+ "feedforward_config": {
65
+ "name": "MLP",
66
+ "dropout": MLP_DROP,
67
+ "activation": "gelu",
68
+ "hidden_layer_multiplier": MLP_MULTIPLIER,
69
+ },
70
+ }
71
+ ]
72
+
73
+
74
+ .. _`Using the Output Embedding to Improve Language Models`: https://arxiv.org/pdf/1608.05859.pdf
75
+ """
76
+
77
+ stack_configs: Union[List[xFormerBlockConfig], Dict[str, xFormerBlockConfig]]
78
+ tie_embedding_weights: bool = False
79
+ weight_init: xFormerWeightInit = xFormerWeightInit.ViT
80
+
81
+ def __init__(
82
+ self,
83
+ stack_configs: Union[List[Dict[str, Any]], Dict[str, Dict[str, Any]]],
84
+ tie_embedding_weights: bool = False,
85
+ weight_init: xFormerWeightInit = xFormerWeightInit.ViT,
86
+ ):
87
+ # Type all the configurations. Possible typos are caught here
88
+ if isinstance(stack_configs, dict):
89
+ self.stack_configs = {}
90
+ for k, config in stack_configs.items():
91
+ if config["block_type"] == "encoder":
92
+ self.stack_configs[k] = xFormerEncoderConfig(**config)
93
+ else:
94
+ self.stack_configs[k] = xFormerDecoderConfig(**config)
95
+ else:
96
+ self.stack_configs = []
97
+ for config in stack_configs:
98
+ if config["block_type"] == "encoder":
99
+ self.stack_configs.append(xFormerEncoderConfig(**config))
100
+ else:
101
+ self.stack_configs.append(xFormerDecoderConfig(**config))
102
+
103
+ self.tie_embedding_weights = tie_embedding_weights
104
+ self.weight_init = weight_init
105
+ deprecated_function(self)
106
+
107
+
108
+ class xFormer(torch.nn.Module):
109
+ def __init__(
110
+ self,
111
+ stack_configs: Union[
112
+ xFormerBlockConfig, List[xFormerBlockConfig], Dict[str, xFormerBlockConfig]
113
+ ],
114
+ tie_embedding_weights: bool = False,
115
+ weight_init: xFormerWeightInit = xFormerWeightInit.ViT,
116
+ ):
117
+ """
118
+ Given a serialized configuration, generate the corresponding model.
119
+ This is only a helper and can easily be bypassed
120
+ """
121
+ super().__init__()
122
+ deprecated_function(self)
123
+
124
+ if isinstance(stack_configs, Dict):
125
+ stack_configs = list(stack_configs.values())
126
+
127
+ # Convenience, users can pass either a list of configs or a single one
128
+ if not isinstance(stack_configs, List):
129
+ stack_configs = [stack_configs]
130
+
131
+ # Sanity checks, some config combinations do not make sense
132
+ self._verify_reversible(stack_configs)
133
+ self._verify_deepnorm(stack_configs)
134
+
135
+ encoders: List[torch.nn.Module] = []
136
+ decoders: List[torch.nn.Module] = []
137
+
138
+ self.reversible_encoder = False
139
+ self.rev_enc_pose_encoding = None
140
+
141
+ # Unroll the configs and build the model
142
+ for config in stack_configs:
143
+ # Handle either Encoder or Decoder stacks
144
+ builder = (
145
+ xFormerEncoderBlock.from_config
146
+ if isinstance(config, xFormerEncoderConfig)
147
+ else xFormerDecoderBlock.from_config
148
+ )
149
+ recipient = (
150
+ encoders if isinstance(config, xFormerEncoderConfig) else decoders
151
+ )
152
+
153
+ # Build up the stack
154
+ for i in range(config.num_layers):
155
+ # Label where this layer is in the stack
156
+ # (for instance useful for the positional encoding, or late layer norm)
157
+ if len(recipient) > 0:
158
+ config.layer_position.mark_not_first()
159
+
160
+ if config != stack_configs[-1] or i < config.num_layers - 1:
161
+ config.layer_position.mark_not_last()
162
+
163
+ block = builder(config) # type: ignore
164
+
165
+ # If reversible: extract the reversible sub-parts, else append the block as-is
166
+ if config.reversible:
167
+ # WARNING: only one pose encoding is saved here (not Focal Transformer compatible for instance)
168
+ assert isinstance(config, xFormerEncoderConfig)
169
+ if block.pose_encoding is not None:
170
+ self.rev_enc_pose_encoding = block.pose_encoding
171
+ self.reversible_encoder = True
172
+
173
+ f, g = xFormerEncoderBlock.get_reversible_layer(config)
174
+ recipient.append(torch.nn.ModuleList([f, g]))
175
+ else:
176
+ recipient.append(block) # type: ignore
177
+
178
+ # Tie embedding weights, if requested and possible
179
+ assert (
180
+ not tie_embedding_weights or not self.reversible_encoder
181
+ ), "Reversible layers and tied embeddings is not supported for now"
182
+
183
+ if (
184
+ tie_embedding_weights
185
+ and encoders
186
+ and encoders[0].pose_encoding
187
+ and decoders
188
+ and decoders[0].pose_encoding
189
+ and not config.reversible
190
+ ):
191
+ logger.info("Tying encoder and decoder embeddings, as requested")
192
+ encoders[0].pose_encoding = decoders[0].pose_encoding
193
+
194
+ self.encoders: torch.nn.Module = (
195
+ rv.ReversibleSequence(torch.nn.ModuleList(encoders))
196
+ if self.reversible_encoder
197
+ else torch.nn.ModuleList(encoders)
198
+ )
199
+ self.decoders = torch.nn.ModuleList(decoders)
200
+
201
+ use_deepnorm = (
202
+ stack_configs[0].residual_norm_style == ResidualNormStyle.DeepNorm
203
+ )
204
+
205
+ assert (
206
+ not use_deepnorm or not self.reversible_encoder
207
+ ), "Reversible layers and deepnorm is not supported for now"
208
+
209
+ self.init_weights(weight_init=weight_init, use_deep_norm=use_deepnorm)
210
+
211
+ @classmethod
212
+ def from_config(cls, config: xFormerConfig):
213
+ return cls(
214
+ config.stack_configs, config.tie_embedding_weights, config.weight_init
215
+ )
216
+
217
+ def _verify_reversible(self, stack_configs: List[xFormerBlockConfig]):
218
+ reversible = [
219
+ c.reversible
220
+ for c in filter(lambda x: x.block_type == "encoder", stack_configs)
221
+ ]
222
+
223
+ assert all(reversible) or not any(reversible), (
224
+ "All layers need to have the same reversibility setting. "
225
+ + f"Currently {reversible}"
226
+ )
227
+
228
+ def _verify_deepnorm(self, stack_configs: List[xFormerBlockConfig]):
229
+ deepnorm = [
230
+ c.residual_norm_style == ResidualNormStyle.DeepNorm for c in stack_configs
231
+ ]
232
+
233
+ assert all(deepnorm) or not any(deepnorm), (
234
+ "All layers need to have the same deepnorm setting. "
235
+ + f"Currently {deepnorm}"
236
+ )
237
+
238
+ def init_weights(self, weight_init: xFormerWeightInit, use_deep_norm: bool):
239
+ # The deepnorm weight initialization method requires different gain factors for the encoder
240
+ # and decoder, depending on the general model structure (number of respective layers)
241
+ if use_deep_norm:
242
+ encoder_coefficients, decoder_coefficients = get_deepnorm_coefficients(
243
+ encoder_layers=len(self.encoders), decoder_layers=len(self.decoders) # type: ignore
244
+ )
245
+ else:
246
+ encoder_coefficients, decoder_coefficients = None, None
247
+
248
+ encoder_gain = (
249
+ encoder_coefficients.beta if encoder_coefficients is not None else 1.0
250
+ )
251
+ decoder_gain = (
252
+ decoder_coefficients.beta if decoder_coefficients is not None else 1.0
253
+ )
254
+
255
+ # Pick the desired init function
256
+ init_fn = get_weight_init_fn(weight_init)
257
+
258
+ # Initialize all the encoder weights
259
+ for name, module in self.encoders.named_children():
260
+ init_fn(module=module, name=name, gain=encoder_gain)
261
+
262
+ for name, module in self.decoders.named_children():
263
+ init_fn(module=module, name=name, gain=decoder_gain)
264
+
265
+ def forward(
266
+ self,
267
+ src: torch.Tensor,
268
+ tgt: Optional[torch.Tensor] = None,
269
+ encoder_input_mask: Optional[torch.Tensor] = None,
270
+ decoder_input_mask: Optional[torch.Tensor] = None,
271
+ ) -> Optional[torch.Tensor]:
272
+
273
+ # Encode to latent space if encoder is present
274
+ if len(list(self.encoders.parameters())) > 0:
275
+ encoders = self.encoders
276
+ memory = src.clone()
277
+ if isinstance(encoders, torch.nn.ModuleList):
278
+ for encoder in encoders:
279
+ memory = encoder(memory, input_mask=encoder_input_mask)
280
+ else:
281
+ if self.rev_enc_pose_encoding:
282
+ memory = self.rev_enc_pose_encoding(src)
283
+
284
+ # Reversible Encoder
285
+ x = torch.cat([memory, memory], dim=-1)
286
+
287
+ # Apply the optional input masking
288
+ if encoder_input_mask is not None:
289
+ if x.dim() - encoder_input_mask.dim() > 1:
290
+ encoder_input_mask.unsqueeze(0)
291
+ x += encoder_input_mask.unsqueeze(-1)
292
+
293
+ x = encoders(x)
294
+ memory = torch.stack(x.chunk(2, dim=-1)).mean(dim=0)
295
+
296
+ if not self.decoders:
297
+ return memory
298
+
299
+ # If decoder: either use the encoder output, or just decode, both options are possible
300
+ if len(self.decoders) > 0:
301
+ tgt = src.clone() if tgt is None else tgt
302
+
303
+ for decoder in self.decoders:
304
+ tgt = decoder(
305
+ target=tgt,
306
+ # pyre-fixme[61]: `memory` is not always initialized here.
307
+ memory=memory,
308
+ input_mask=decoder_input_mask,
309
+ )
310
+
311
+ return tgt
312
+
313
+ return None
.venv/lib/python3.11/site-packages/xformers/factory/weight_init.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # CREDITS: Reusing a lot of code from the Timm repo
7
+ # main difference is probably the handling of deepnorm init, and adapting to some xformers specificities
8
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import math
12
+ from enum import Enum
13
+ from typing import Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.nn.init import (
18
+ _calculate_fan_in_and_fan_out,
19
+ _no_grad_trunc_normal_,
20
+ _no_grad_uniform_,
21
+ )
22
+
23
+ logger = logging.getLogger("xformers")
24
+
25
+
26
+ _assert_if_not_initialized = False
27
+
28
+
29
+ class xFormerWeightInit(str, Enum):
30
+ Timm = "timm"
31
+ ViT = "vit"
32
+ Moco = "moco"
33
+ Small = "small"
34
+
35
+
36
+ def get_weight_init_fn(init_choice: xFormerWeightInit):
37
+ """
38
+ Provide the xFormers factory with weight init routines.
39
+
40
+ Supported initializations are:
41
+ - Small: follow the method outlined in `Transformer Without Tears`_
42
+ - ViT: follow the initialization in the reference ViT_ codebase
43
+ - Timm: follow the initialization in the reference Timm_ codebase
44
+ - Moco: follow the initialization in the reference MocoV3_ codebase
45
+
46
+ .. _ViT: https://github.com/google-research/vision_transformer
47
+ .. _Timm: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
48
+ .. _MocoV3: https://github.com/facebookresearch/moco-v3
49
+ """
50
+ return {
51
+ xFormerWeightInit.Timm: _init_weights_vit_timm,
52
+ xFormerWeightInit.ViT: _init_weights_vit_jax,
53
+ xFormerWeightInit.Moco: _init_weights_vit_moco,
54
+ xFormerWeightInit.Small: _init_weights_small,
55
+ }[init_choice]
56
+
57
+
58
+ # Define pattern matches
59
+ def is_ffn(n):
60
+ return "feedforward" in n or ("wrap_ff" in n and not n.endswith("norm"))
61
+
62
+
63
+ def is_mha_input_projection(n):
64
+ return "q_proj" in n or "k_proj" in n or "v_proj" in n
65
+
66
+
67
+ # Define distribution helpers
68
+ def _small_init_(tensor: torch.Tensor, gain: float = 1.0) -> torch.Tensor:
69
+ r"""Fills the input `Tensor` with values according to the method
70
+ described in `Transformer Without Tears`_, using a uniform distribution.
71
+
72
+ This is a variation of the Xavier init. The resulting tensor will have values sampled from
73
+ :math:`\mathcal{U}(-a, a)` where
74
+
75
+ .. math::
76
+ a = \text{gain} \times \sqrt{\frac{6}{\text{fan\_in} + 4 * \text{fan\_out}}}
77
+
78
+ Also known as Glorot initialization.
79
+
80
+ Args:
81
+ tensor: an n-dimensional `torch.Tensor`
82
+ gain: an optional scaling factor
83
+
84
+ .. _`Transformer Without Tears`: https://arxiv.org/abs/1910.05895
85
+
86
+ """
87
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
88
+ std = gain * math.sqrt(2.0 / float(fan_in + 4 * fan_out))
89
+ a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
90
+
91
+ return _no_grad_uniform_(tensor, -a, a)
92
+
93
+
94
+ def _lecun_normal(tensor, gain=1.0):
95
+ fan_in, _ = _calculate_fan_in_and_fan_out(tensor)
96
+ denom = fan_in
97
+ variance = gain / denom
98
+
99
+ # constant is stddev of standard normal truncated to (-2, 2)
100
+ _no_grad_trunc_normal_(
101
+ tensor,
102
+ mean=0.0,
103
+ std=math.sqrt(variance) / 0.87962566103423978,
104
+ a=-2.0,
105
+ b=2.0,
106
+ )
107
+
108
+
109
+ # Helpers to keep all the functions typesafe, and handle corner cases and common behaviours in one place
110
+ def _maybe_init_tensor(module: nn.Module, attr: str, distribution_: Callable, **kwargs):
111
+ # Small helper to catch all the corner cases, while staying type safe
112
+ if hasattr(module, attr):
113
+ maybe_tensor = getattr(module, attr)
114
+ if maybe_tensor is not None and isinstance(maybe_tensor, torch.Tensor):
115
+ distribution_(maybe_tensor, **kwargs)
116
+
117
+
118
+ def _maybe_report_no_init(module, name):
119
+ if len(list(module.named_children())) == 0 and (
120
+ hasattr(module, "weight") or hasattr(module, "bias")
121
+ ):
122
+ # Skip layer norm, this is ok
123
+ if isinstance(module, torch.nn.LayerNorm):
124
+ return
125
+
126
+ # Skip nn.Embedding, we typically initialize it one level up, else Pytorch has a valid default
127
+ if isinstance(module, torch.nn.Embedding):
128
+ return
129
+
130
+ # This is unexpected, warn about a possible unhandled weight
131
+ logger.warning(
132
+ f"Not initializing weights in {name}, this could be a mistake.\nModule {module}"
133
+ )
134
+
135
+ if _assert_if_not_initialized:
136
+ assert False, (
137
+ f"Uninitialized weight found in {module}."
138
+ + " If you have a custom module, please provide a `init_weights()` method"
139
+ )
140
+
141
+
142
+ # Define the different initialization schemes
143
def _init_weights_vit_jax(
    module: nn.Module,
    name: str = "",
    head_bias: float = 0.0,
    gain: float = 1.0,
    deepnorm_style: bool = False,
    **kwargs,
):
    """ViT weight initialization, matching JAX (Flax) impl.

    Args:
        module: root module, initialized recursively.
        name: dotted path of ``module`` within its parent, used to detect
            FFN / attention input projections by name.
        head_bias: kept for interface compatibility with the other schemes.
        gain: multiplicative factor applied to the xavier uniform init.
        deepnorm_style: if True, ``q_proj``/``k_proj`` keep a gain of 1.0
            (DeepNorm only rescales the value/output projections).
    """

    if is_ffn(name):
        # JAX uses a near-zero normal for the FFN biases
        _maybe_init_tensor(module, "bias", nn.init.normal_, std=1e-6)
        _maybe_init_tensor(module, "weight", torch.nn.init.xavier_uniform_, gain=gain)

    elif is_mha_input_projection(name) or isinstance(module, nn.Linear):
        if deepnorm_style and (
            "q_proj" in name.split(".") or "k_proj" in name.split(".")
        ):
            gain = 1.0

        _maybe_init_tensor(module, "weight", torch.nn.init.xavier_uniform_, gain=gain)
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif isinstance(module, nn.Conv2d):
        _maybe_init_tensor(module, "weight", _lecun_normal, gain=gain)
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif hasattr(module, "init_weights"):
        # The module knows how to initialize itself, delegate
        module.init_weights()  # type: ignore

    else:
        _maybe_report_no_init(module, name)

    # Recurse over the children, if the weight init is being handled here
    if not hasattr(module, "init_weights"):
        for child_name, child_module in module.named_children():
            # BUGFIX: propagate deepnorm_style; it was previously dropped in
            # the recursion (defaulting back to False), so nested q/k
            # projections never received the DeepNorm gain reset.
            _init_weights_vit_jax(
                child_module,
                f"{name}.{child_name}",
                head_bias,
                gain,
                deepnorm_style,
            )
180
+
181
+
182
def _init_weights_vit_moco(
    module: nn.Module,
    name: str = "",
    gain: float = 1.0,
    **kwargs,
):
    """ViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed.

    Args:
        module: root module, initialized recursively.
        name: name of ``module`` within its parent, used to detect FFN /
            attention input projections by name.
        gain: multiplicative factor applied to the init scale.

    Raises:
        AssertionError: if ``deepnorm_style`` is passed, which this
            initialization method does not support.
    """

    assert (
        "deepnorm_style" not in kwargs.keys()
    ), "This initialization method does not support deepnorm"

    if is_ffn(name):
        _maybe_init_tensor(module, "weight", torch.nn.init.xavier_uniform_, gain=gain)
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif is_mha_input_projection(name) or isinstance(module, nn.Linear):
        # BUGFIX: use getattr with a default — a name-matched module without a
        # `weight` attribute previously raised AttributeError here.
        weight = getattr(module, "weight", None)
        if isinstance(weight, torch.Tensor):
            # Xavier-uniform bound, scaled by the requested gain
            val = math.sqrt(6.0 / float(weight.shape[0] + weight.shape[1])) * gain
            _maybe_init_tensor(module, "weight", nn.init.uniform_, a=-val, b=val)

        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif hasattr(module, "init_weights"):
        module.init_weights(gain=gain)  # type: ignore

    else:
        _maybe_report_no_init(module, name)

    # Recurse over the children, if the weight init is being handled here
    if not hasattr(module, "init_weights"):
        for child_name, child_module in module.named_children():
            # NOTE(review): unlike the jax/small variants, the recursion here
            # passes the bare child name (no dotted prefix) — presumably the
            # name matchers only need the leaf name; confirm against is_ffn /
            # is_mha_input_projection before unifying.
            _init_weights_vit_moco(child_module, child_name, gain)
218
+
219
+
220
def _init_weights_small(
    module: nn.Module,
    name: str = "",
    head_bias: float = 0.0,
    gain: float = 1.0,
    deepnorm_style: bool = False,
    **kwargs,
):
    """Follow the `Transformer Without Tears`_ initialization for self-attention.

    Args:
        module: root module, initialized recursively.
        name: dotted path of ``module`` within its parent, used to detect
            FFN / attention input projections by name.
        head_bias: kept for interface compatibility with the other schemes.
        gain: multiplicative factor applied to the init scale.
        deepnorm_style: if True, ``q_proj``/``k_proj`` keep a gain of 1.0.
    """

    if is_ffn(name):
        _maybe_init_tensor(module, "weight", torch.nn.init.xavier_uniform_, gain=gain)
        _maybe_init_tensor(module, "bias", nn.init.normal_, std=1e-6)

    elif is_mha_input_projection(name) or isinstance(module, nn.Linear):
        # "small init" only scales the attention layers init, not the FFN
        if deepnorm_style and (
            "q_proj" in name.split(".") or "k_proj" in name.split(".")
        ):
            gain = 1.0

        _maybe_init_tensor(module, "weight", _small_init_, gain=gain)
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif isinstance(module, nn.Conv2d):
        # NOTE(review): the jax variant forwards `gain` to _lecun_normal here,
        # this one does not — confirm whether that asymmetry is intentional.
        _maybe_init_tensor(module, "weight", _lecun_normal)
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif hasattr(module, "init_weights"):
        module.init_weights()  # type: ignore

    else:
        _maybe_report_no_init(module, name)

    # Recurse over the children, if the weight init is being handled here
    if not hasattr(module, "init_weights"):
        for child_name, child_module in module.named_children():
            # BUGFIX: propagate deepnorm_style; it was previously dropped in
            # the recursion, so nested q/k projections never received the
            # DeepNorm gain reset.
            _init_weights_small(
                child_module,
                f"{name}.{child_name}",
                head_bias,
                gain,
                deepnorm_style,
            )
256
+
257
+
258
def _init_weights_vit_timm(
    module: nn.Module,
    name: str = "",
    gain: float = 1.0,
    deepnorm_style: bool = False,
    **kwargs,
):
    """
    ViT weight initialization, original timm impl (for reproducibility).

    See DeepNet_ for all the DeepNorm specific codepaths.

    Args:
        module: root module, initialized recursively.
        name: dotted path of ``module`` within its parent, used for the
            DeepNorm q/k detection and for warning messages.
        gain: multiplicative factor applied to the truncated-normal std.
        deepnorm_style: if True, ``q_proj``/``k_proj`` keep a gain of 1.
    """

    if isinstance(module, nn.Linear):
        if deepnorm_style and (
            "q_proj" in name.split(".") or "k_proj" in name.split(".")
        ):
            gain = 1

        std = 0.02 * gain
        a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation

        _maybe_init_tensor(
            module, "weight", _no_grad_trunc_normal_, mean=0.0, std=std, a=-a, b=a
        )
        _maybe_init_tensor(module, "bias", nn.init.zeros_)

    elif hasattr(module, "init_weights"):
        module.init_weights(gain=gain)  # type: ignore

    else:
        _maybe_report_no_init(module, name)

    # Recurse over the children, if the weight init is being handled here
    if not hasattr(module, "init_weights"):
        for child_name, child_module in module.named_children():
            # BUGFIX: propagate deepnorm_style (it was dropped, defaulting to
            # False below the first level) and keep the dotted name prefix so
            # nested q/k projections are detected and warnings show the full
            # path — `"q_proj" in name.split(".")` is prefix-safe.
            _init_weights_vit_timm(
                child_module,
                f"{name}.{child_name}",
                gain,
                deepnorm_style,
            )
.venv/lib/python3.11/site-packages/xformers/info.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from typing import Dict
8
+
9
+ import torch
10
+
11
+ from . import __version__, _cpp_lib, _is_opensource, _is_triton_available, ops
12
+ from .ops.common import OPERATORS_REGISTRY
13
+ from .profiler.profiler_dcgm import DCGM_PROFILER_AVAILABLE
14
+
15
+
16
def get_features_status() -> Dict[str, str]:
    """Collect per-operator availability, swiglu details and triton status
    into a flat ``name -> status string`` mapping."""
    features: Dict[str, str] = {
        f"{op.OPERATOR_CATEGORY}.{op.NAME}": (
            "available" if op.is_available() else "unavailable"
        )
        for op in OPERATORS_REGISTRY
    }
    for key, value in ops.swiglu_op._info().items():
        features[f"swiglu.{key}"] = value
    features["is_triton_available"] = str(_is_triton_available())
    return features
25
+
26
+
27
def print_info():
    """Print a human-readable report of the xFormers runtime/build status,
    one `name: status` line per feature."""
    info = get_features_status()
    print(f"xFormers {__version__}")

    info["pytorch.version"] = torch.__version__
    if torch.cuda.is_available():
        info["pytorch.cuda"] = "available"
        device = torch.cuda.current_device()
        capability = torch.cuda.get_device_capability(device)
        info["gpu.compute_capability"] = ".".join(str(ver) for ver in capability)
        info["gpu.name"] = torch.cuda.get_device_name(device)
    else:
        info["pytorch.cuda"] = "not available"

    info["dcgm_profiler"] = "available" if DCGM_PROFILER_AVAILABLE else "unavailable"

    build_info = _cpp_lib._build_metadata
    load_exception = _cpp_lib._cpp_library_load_exception
    if build_info is None and isinstance(
        load_exception, _cpp_lib.xFormersInvalidLibException
    ):
        # The C++ extension failed to load, but the exception still carries
        # the build metadata it was compiled with
        build_info = load_exception.build_info
    if build_info is None:
        info["build.info"] = "none"
    else:
        info["build.info"] = "available"
        info["build.cuda_version"] = build_info.cuda_version
        info["build.hip_version"] = build_info.hip_version
        info["build.python_version"] = build_info.python_version
        info["build.torch_version"] = build_info.torch_version
        for env_key, env_value in build_info.build_env.items():
            info[f"build.env.{env_key}"] = env_value

    try:
        nvcc_version = torch.ops.xformers._nvcc_build_version()
        info["build.nvcc_version"] = ".".join(str(v) for v in nvcc_version)
    except (RuntimeError, AttributeError):
        # Extension not built with nvcc info, or op not registered
        pass

    info["source.privacy"] = "open source" if _is_opensource else "fairinternal"

    for name, status in info.items():
        print(f"{name + ':':<50} {status}")
74
+
75
+
76
+ if __name__ == "__main__":
77
+ print_info()
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/common.cpython-311.pyc ADDED
Binary file (3.08 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/differentiable_collectives.cpython-311.pyc ADDED
Binary file (8.61 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/indexing.cpython-311.pyc ADDED
Binary file (8.59 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/ipc.cpython-311.pyc ADDED
Binary file (7.89 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/modpar_layers.cpython-311.pyc ADDED
Binary file (8.76 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/rmsnorm.cpython-311.pyc ADDED
Binary file (4.78 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/rope_padded.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/seqpar.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/sequence_parallel_fused_ops.cpython-311.pyc ADDED
Binary file (53.7 kB). View file