koichi12 committed on Feb 12, 2025

Commit

9521f07

verified ·

1 Parent(s): 2b0b5c7

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__init__.py +13 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/_staggered.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/types.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/_staggered.py +202 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/impl.py +210 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/py.typed +0 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/types.py +12 -0
.venv/lib/python3.11/site-packages/aiohappyeyeballs/utils.py +97 -0
.venv/lib/python3.11/site-packages/attrs/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/attrs/__pycache__/validators.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/__init__.py +62 -0
.venv/lib/python3.11/site-packages/vllm/_custom_ops.py +1098 -0
.venv/lib/python3.11/site-packages/vllm/_ipex_ops.py +228 -0
.venv/lib/python3.11/site-packages/vllm/_version.py +16 -0
.venv/lib/python3.11/site-packages/vllm/beam_search.py +73 -0
.venv/lib/python3.11/site-packages/vllm/config.py +0 -0
.venv/lib/python3.11/site-packages/vllm/connections.py +169 -0
.venv/lib/python3.11/site-packages/vllm/cumem_allocator.abi3.so +0 -0
.venv/lib/python3.11/site-packages/vllm/envs.py +588 -0
.venv/lib/python3.11/site-packages/vllm/executor/__init__.py +0 -0
.venv/lib/python3.11/site-packages/vllm/executor/__pycache__/ray_distributed_executor.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/executor/executor_base.py +376 -0
.venv/lib/python3.11/site-packages/vllm/executor/mp_distributed_executor.py +243 -0
.venv/lib/python3.11/site-packages/vllm/executor/msgspec_utils.py +29 -0
.venv/lib/python3.11/site-packages/vllm/executor/ray_distributed_executor.py +638 -0
.venv/lib/python3.11/site-packages/vllm/executor/ray_utils.py +378 -0
.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py +134 -0
.venv/lib/python3.11/site-packages/vllm/forward_context.py +101 -0
.venv/lib/python3.11/site-packages/vllm/logger.py +210 -0
.venv/lib/python3.11/site-packages/vllm/logits_process.py +121 -0
.venv/lib/python3.11/site-packages/vllm/outputs.py +529 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__init__.py +237 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cpu.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cuda.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/hpu.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/interface.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/neuron.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/openvino.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/rocm.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/tpu.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/xpu.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/platforms/cpu.py +145 -0
.venv/lib/python3.11/site-packages/vllm/platforms/cuda.py +390 -0
.venv/lib/python3.11/site-packages/vllm/platforms/hpu.py +90 -0
.venv/lib/python3.11/site-packages/vllm/platforms/interface.py +308 -0
.venv/lib/python3.11/site-packages/vllm/platforms/neuron.py +57 -0

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Version string of the aiohappyeyeballs package.
+__version__ = "2.4.6"
+from .impl import start_connection
+from .types import AddrInfoType
+from .utils import addr_to_addr_infos, pop_addr_infos_interleave, remove_addr_infos
+# Names re-exported as the package's public API.
+__all__ = (
+    "AddrInfoType",
+    "addr_to_addr_infos",
+    "pop_addr_infos_interleave",
+    "remove_addr_infos",
+    "start_connection",
+)

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (520 Bytes). View file

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/_staggered.cpython-311.pyc ADDED Viewed

Binary file (8.87 kB). View file

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-311.pyc ADDED Viewed

Binary file (9.75 kB). View file

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/types.cpython-311.pyc ADDED Viewed

Binary file (514 Bytes). View file

.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (4.16 kB). View file

.venv/lib/python3.11/site-packages/aiohappyeyeballs/_staggered.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import asyncio
+import contextlib
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    Callable,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+)
+_T = TypeVar("_T")
+def _set_result(wait_next: "asyncio.Future[None]") -> None:
+    """Resolve *wait_next* with ``None``; do nothing when it is already done."""
+    if wait_next.done():
+        # Already resolved or cancelled, so leave the future untouched.
+        return
+    wait_next.set_result(None)
+async def _wait_one(
+    futures: "Iterable[asyncio.Future[Any]]",
+    loop: asyncio.AbstractEventLoop,
+) -> _T:
+    """
+    Wait for the first future to complete.
+    A completion callback is attached to every future in *futures*; the
+    first one to fire resolves an internal waiter with that future, which
+    is then returned. The callbacks are always detached again before
+    returning, including when the wait is cancelled.
+    Args:
+    ----
+        futures: the futures (or tasks) to watch. Any iterable is accepted,
+            including one-shot iterators.
+        loop: the event loop used to create the internal waiter future.
+    Returns:
+    -------
+        The future from *futures* that completed first.
+    """
+    # Snapshot the iterable: it is walked twice (attach, then detach), so a
+    # one-shot iterator would otherwise never get its callbacks removed.
+    watched = tuple(futures)
+    wait_next = loop.create_future()
+    def _on_completion(fut: "asyncio.Future[Any]") -> None:
+        # Only the first completion wins; later ones are ignored.
+        if not wait_next.done():
+            wait_next.set_result(fut)
+    for f in watched:
+        f.add_done_callback(_on_completion)
+    try:
+        return await wait_next
+    finally:
+        for f in watched:
+            f.remove_done_callback(_on_completion)
+async def staggered_race(
+    coro_fns: Iterable[Callable[[], Awaitable[_T]]],
+    delay: Optional[float],
+    *,
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+) -> Tuple[Optional[_T], Optional[int], List[Optional[BaseException]]]:
+    """
+    Run coroutines with staggered start times and take the first to finish.
+    This method takes an iterable of coroutine functions. The first one is
+    started immediately. From then on, whenever the immediately preceding one
+    fails (raises an exception), or when *delay* seconds has passed, the next
+    coroutine is started. This continues until one of the coroutines complete
+    successfully, in which case all others are cancelled, or until all
+    coroutines fail.
+    The coroutines provided should be well-behaved in the following way:
+    * They should only ``return`` if completed successfully.
+    * They should always raise an exception if they did not complete
+      successfully. In particular, if they handle cancellation, they should
+      probably reraise, like this::
+        try:
+            # do work
+        except asyncio.CancelledError:
+            # undo partially completed work
+            raise
+    Args:
+    ----
+        coro_fns: an iterable of coroutine functions, i.e. callables that
+            return a coroutine object when called. Use ``functools.partial`` or
+            lambdas to pass arguments.
+        delay: amount of time, in seconds, between starting coroutines. If
+            ``None`` (or any other falsy value such as ``0``), no timer is
+            armed and the coroutines will run sequentially.
+        loop: the event loop to use. If ``None``, the running loop is used.
+    Returns:
+    -------
+        tuple *(winner_result, winner_index, exceptions)* where
+        - *winner_result*: the result of the winning coroutine, or ``None``
+          if no coroutines won.
+        - *winner_index*: the index of the winning coroutine in
+          ``coro_fns``, or ``None`` if no coroutines won. If the winning
+          coroutine may return None on success, *winner_index* can be used
+          to definitively determine whether any coroutine won.
+        - *exceptions*: list of exceptions returned by the coroutines.
+          ``len(exceptions)`` is equal to the number of coroutines actually
+          started, and the order is the same as in ``coro_fns``. The winning
+          coroutine's entry is ``None``.
+    """
+    loop = loop or asyncio.get_running_loop()
+    # One slot per started coroutine; run_one_coro() fills in failures by index.
+    exceptions: List[Optional[BaseException]] = []
+    # Tasks that have been started and whose result has not been consumed yet.
+    tasks: Set[asyncio.Task[Optional[Tuple[_T, int]]]] = set()
+    async def run_one_coro(
+        coro_fn: Callable[[], Awaitable[_T]],
+        this_index: int,
+        start_next: "asyncio.Future[None]",
+    ) -> Optional[Tuple[_T, int]]:
+        """
+        Run a single coroutine.
+        If the coroutine fails, set the exception in the exceptions list and
+        start the next coroutine by setting the result of the start_next.
+        If the coroutine succeeds, return the result and the index of the
+        coroutine in the coro_fns list.
+        If SystemExit or KeyboardInterrupt is raised, re-raise it.
+        """
+        try:
+            result = await coro_fn()
+        except (SystemExit, KeyboardInterrupt):
+            raise
+        except BaseException as e:
+            # Every other failure (including cancellation) is recorded
+            # rather than propagated; the task then finishes with ``None``.
+            exceptions[this_index] = e
+            _set_result(start_next)  # Kickstart the next coroutine
+            return None
+        return result, this_index
+    start_next_timer: Optional[asyncio.TimerHandle] = None
+    start_next: Optional[asyncio.Future[None]]
+    task: asyncio.Task[Optional[Tuple[_T, int]]]
+    done: Union[asyncio.Future[None], asyncio.Task[Optional[Tuple[_T, int]]]]
+    coro_iter = iter(coro_fns)
+    this_index = -1
+    try:
+        while True:
+            # ``None`` is the exhaustion sentinel; note that the truthiness
+            # test treats any falsy entry in coro_fns the same way.
+            if coro_fn := next(coro_iter, None):
+                this_index += 1
+                exceptions.append(None)
+                # Resolved either by the delay timer or by run_one_coro()
+                # when this coroutine fails.
+                start_next = loop.create_future()
+                task = loop.create_task(run_one_coro(coro_fn, this_index, start_next))
+                tasks.add(task)
+                start_next_timer = (
+                    loop.call_later(delay, _set_result, start_next) if delay else None
+                )
+            elif not tasks:
+                # We exhausted the coro_fns list and no tasks are running
+                # so we have no winner and all coroutines failed.
+                break
+            while tasks or start_next:
+                done = await _wait_one(
+                    (*tasks, start_next) if start_next else tasks, loop
+                )
+                if done is start_next:
+                    # The current task has failed or the timer has expired
+                    # so we need to start the next task.
+                    start_next = None
+                    if start_next_timer:
+                        start_next_timer.cancel()
+                        start_next_timer = None
+                    # Break out of the task waiting loop to start the next
+                    # task.
+                    break
+                if TYPE_CHECKING:
+                    assert isinstance(done, asyncio.Task)
+                tasks.remove(done)
+                # A finished task yields ``(result, index)`` on success and
+                # ``None`` on failure; only a success ends the race.
+                if winner := done.result():
+                    return *winner, exceptions
+    finally:
+        # We either have:
+        #  - a winner
+        #  - all tasks failed
+        #  - a KeyboardInterrupt or SystemExit.
+        #
+        # If the timer is still running, cancel it.
+        #
+        if start_next_timer:
+            start_next_timer.cancel()
+        #
+        # If there are any tasks left, cancel them and then
+        # wait for them so they fill the exceptions list.
+        #
+        for task in tasks:
+            task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await task
+    return None, None, exceptions

.venv/lib/python3.11/site-packages/aiohappyeyeballs/impl.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""Base implementation."""
+import asyncio
+import collections
+import functools
+import itertools
+import socket
+from typing import List, Optional, Sequence, Union
+from . import _staggered
+from .types import AddrInfoType
+async def start_connection(
+    addr_infos: Sequence[AddrInfoType],
+    *,
+    local_addr_infos: Optional[Sequence[AddrInfoType]] = None,
+    happy_eyeballs_delay: Optional[float] = None,
+    interleave: Optional[int] = None,
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+) -> socket.socket:
+    """
+    Connect to a TCP server.
+    Create a socket connection to a specified destination.  The
+    destination is specified as a list of AddrInfoType tuples as
+    returned from getaddrinfo().
+    The arguments are, in order:
+    * ``family``: the address family, e.g. ``socket.AF_INET`` or
+        ``socket.AF_INET6``.
+    * ``type``: the socket type, e.g. ``socket.SOCK_STREAM`` or
+        ``socket.SOCK_DGRAM``.
+    * ``proto``: the protocol, e.g. ``socket.IPPROTO_TCP`` or
+        ``socket.IPPROTO_UDP``.
+    * ``canonname``: the canonical name of the address, e.g.
+        ``"www.python.org"``.
+    * ``sockaddr``: the socket address
+    This method is a coroutine which will try to establish the connection
+    in the background. When successful, the coroutine returns a
+    socket.
+    The expected use case is to use this method in conjunction with
+    loop.create_connection() to establish a connection to a server::
+            socket = await start_connection(addr_infos)
+            transport, protocol = await loop.create_connection(
+                MyProtocol, sock=socket, ...)
+    Keyword arguments:
+    * ``local_addr_infos``: optional AddrInfoType tuples to bind the socket
+        to before connecting.
+    * ``happy_eyeballs_delay``: when not ``None`` (and more than one address
+        is given) the attempts are raced via ``staggered_race`` with this
+        delay; otherwise the addresses are tried one after another.
+    * ``interleave``: passed to ``_interleave_addrinfos`` to reorder the
+        addresses by family; defaults to ``1`` when happy eyeballs is used.
+    * ``loop``: event loop used for the connect calls; the running loop is
+        used when omitted.
+    Raises ``OSError`` or ``RuntimeError`` when every attempt failed; a
+    single combined exception is built when the failures differ.
+    """
+    if not (current_loop := loop):
+        current_loop = asyncio.get_running_loop()
+    single_addr_info = len(addr_infos) == 1
+    if happy_eyeballs_delay is not None and interleave is None:
+        # If using happy eyeballs, default to interleave addresses by family
+        interleave = 1
+    if interleave and not single_addr_info:
+        addr_infos = _interleave_addrinfos(addr_infos, interleave)
+    sock: Optional[socket.socket] = None
+    # uvloop can raise RuntimeError instead of OSError
+    # Each _connect_sock() call appends its own sub-list of failures here.
+    exceptions: List[List[Union[OSError, RuntimeError]]] = []
+    if happy_eyeballs_delay is None or single_addr_info:
+        # not using happy eyeballs
+        for addrinfo in addr_infos:
+            try:
+                sock = await _connect_sock(
+                    current_loop, exceptions, addrinfo, local_addr_infos
+                )
+                break
+            except (RuntimeError, OSError):
+                # Already recorded in ``exceptions``; try the next address.
+                continue
+    else:  # using happy eyeballs
+        # NOTE(review): ``loop`` is not forwarded here, so staggered_race()
+        # falls back to the running loop - confirm this is intended when a
+        # caller passes an explicit loop.
+        sock, _, _ = await _staggered.staggered_race(
+            (
+                functools.partial(
+                    _connect_sock, current_loop, exceptions, addrinfo, local_addr_infos
+                )
+                for addrinfo in addr_infos
+            ),
+            happy_eyeballs_delay,
+        )
+    if sock is None:
+        # Flatten the per-attempt failure lists into one list.
+        all_exceptions = [exc for sub in exceptions for exc in sub]
+        try:
+            # NOTE(review): with an empty ``addr_infos`` nothing was recorded
+            # and this lookup raises IndexError rather than OSError.
+            first_exception = all_exceptions[0]
+            if len(all_exceptions) == 1:
+                raise first_exception
+            else:
+                # If they all have the same str(), raise one.
+                model = str(first_exception)
+                if all(str(exc) == model for exc in all_exceptions):
+                    raise first_exception
+                # Raise a combined exception so the user can see all
+                # the various error messages.
+                msg = "Multiple exceptions: {}".format(
+                    ", ".join(str(exc) for exc in all_exceptions)
+                )
+                # If the errno is the same for all exceptions, raise
+                # an OSError with that errno.
+                if isinstance(first_exception, OSError):
+                    first_errno = first_exception.errno
+                    if all(
+                        isinstance(exc, OSError) and exc.errno == first_errno
+                        for exc in all_exceptions
+                    ):
+                        raise OSError(first_errno, msg)
+                elif isinstance(first_exception, RuntimeError) and all(
+                    isinstance(exc, RuntimeError) for exc in all_exceptions
+                ):
+                    raise RuntimeError(msg)
+                # We have a mix of OSError and RuntimeError
+                # so we have to pick which one to raise.
+                # and we raise OSError for compatibility
+                raise OSError(msg)
+        finally:
+            # Drop the local references to the collected exceptions before
+            # the raised exception leaves this frame (presumably to avoid
+            # reference cycles through the traceback).
+            all_exceptions = None  # type: ignore[assignment]
+            exceptions = None  # type: ignore[assignment]
+    return sock
+async def _connect_sock(
+    loop: asyncio.AbstractEventLoop,
+    exceptions: List[List[Union[OSError, RuntimeError]]],
+    addr_info: AddrInfoType,
+    local_addr_infos: Optional[Sequence[AddrInfoType]] = None,
+) -> socket.socket:
+    """
+    Create, bind and connect one socket.
+    A fresh list is appended to *exceptions* and every failure of this
+    attempt is recorded in it. When *local_addr_infos* is given, the socket
+    is bound to the first local address of the same family that accepts the
+    bind. On any failure the socket is closed and the exception re-raised.
+    Returns the connected, non-blocking socket.
+    """
+    my_exceptions: List[Union[OSError, RuntimeError]] = []
+    exceptions.append(my_exceptions)
+    family, type_, proto, _, address = addr_info
+    sock = None
+    try:
+        sock = socket.socket(family=family, type=type_, proto=proto)
+        sock.setblocking(False)
+        if local_addr_infos is not None:
+            for lfamily, _, _, _, laddr in local_addr_infos:
+                # skip local addresses of different family
+                if lfamily != family:
+                    continue
+                try:
+                    sock.bind(laddr)
+                    break
+                except OSError as exc:
+                    # Re-wrap with a message that names the local address,
+                    # keeping the original errno.
+                    msg = (
+                        f"error while attempting to bind on "
+                        f"address {laddr!r}: "
+                        f"{(exc.strerror or '').lower()}"
+                    )
+                    exc = OSError(exc.errno, msg)
+                    my_exceptions.append(exc)
+            else:  # all bind attempts failed
+                # Reached when the loop never hit ``break``: either every
+                # bind raised, or no local address had a matching family.
+                if my_exceptions:
+                    # pop() because the handler below appends it again.
+                    raise my_exceptions.pop()
+                else:
+                    raise OSError(f"no matching local address with {family=} found")
+        await loop.sock_connect(sock, address)
+        return sock
+    except (RuntimeError, OSError) as exc:
+        my_exceptions.append(exc)
+        if sock is not None:
+            try:
+                sock.close()
+            except OSError as e:
+                my_exceptions.append(e)
+                raise
+        raise
+    except:
+        # Deliberately bare: anything else (such as task cancellation) must
+        # still close the socket before propagating. These exceptions are
+        # not recorded unless closing the socket itself fails.
+        if sock is not None:
+            try:
+                sock.close()
+            except OSError as e:
+                my_exceptions.append(e)
+                raise
+        raise
+    finally:
+        # Clear the local references to the exception lists (presumably to
+        # avoid reference cycles through the stored tracebacks).
+        exceptions = my_exceptions = None  # type: ignore[assignment]
+def _interleave_addrinfos(
+    addrinfos: Sequence[AddrInfoType], first_address_family_count: int = 1
+) -> List[AddrInfoType]:
+    """
+    Interleave list of addrinfo tuples by family.
+    Addresses are grouped by family (element 0 of each tuple) in order of
+    first appearance and then emitted round-robin across the families.
+    Args:
+    ----
+        addrinfos: the addrinfo tuples to reorder; may be empty.
+        first_address_family_count: how many addresses of the first family
+            to emit up front. Values above 1 place that many minus one
+            addresses of the first family ahead of the round-robin part.
+    Returns:
+    -------
+        A new list with the same entries in interleaved order; an empty
+        list when *addrinfos* is empty.
+    """
+    # Group addresses by family
+    addrinfos_by_family: collections.OrderedDict[int, List[AddrInfoType]] = (
+        collections.OrderedDict()
+    )
+    for addr in addrinfos:
+        addrinfos_by_family.setdefault(addr[0], []).append(addr)
+    addrinfos_lists = list(addrinfos_by_family.values())
+    reordered: List[AddrInfoType] = []
+    if not addrinfos_lists:
+        # Nothing to interleave; indexing the first family below would
+        # otherwise raise IndexError for an empty input.
+        return reordered
+    if first_address_family_count > 1:
+        head = first_address_family_count - 1
+        reordered.extend(addrinfos_lists[0][:head])
+        del addrinfos_lists[0][:head]
+    # zip_longest pads the shorter families with None, which is dropped.
+    reordered.extend(
+        a
+        for a in itertools.chain.from_iterable(itertools.zip_longest(*addrinfos_lists))
+        if a is not None
+    )
+    return reordered

.venv/lib/python3.11/site-packages/aiohappyeyeballs/py.typed ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/aiohappyeyeballs/types.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Types for aiohappyeyeballs."""
+import socket
+from typing import Tuple, Union
+# One address entry in the layout returned by ``getaddrinfo()``:
+# (family, type, proto, canonname, sockaddr).
+AddrInfoType = Tuple[
+    Union[int, socket.AddressFamily],
+    Union[int, socket.SocketKind],
+    int,
+    str,
+    Tuple,  # type: ignore[type-arg]
+]

.venv/lib/python3.11/site-packages/aiohappyeyeballs/utils.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""Utility functions for aiohappyeyeballs."""
+import ipaddress
+import socket
+from typing import Dict, List, Optional, Tuple, Union
+from .types import AddrInfoType
+def addr_to_addr_infos(
+    addr: Optional[
+        Union[Tuple[str, int, int, int], Tuple[str, int, int], Tuple[str, int]]
+    ],
+) -> Optional[List[AddrInfoType]]:
+    """
+    Build a one-element addr_info list from a socket address tuple.
+    ``None`` is passed through unchanged. A host containing ``":"`` is
+    treated as IPv6 and its address is padded to the four-element form
+    ``(host, port, flowinfo, scopeid)``, missing fields defaulting to 0.
+    Any other host is treated as IPv4 and reduced to ``(host, port)``.
+    The entry always uses ``SOCK_STREAM`` / ``IPPROTO_TCP`` and an empty
+    canonical name.
+    """
+    if addr is None:
+        return None
+    host, port = addr[0], addr[1]
+    if ":" not in host:
+        # IPv4: any extra elements of the input tuple are dropped.
+        return [
+            (socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP, "", (host, port))
+        ]
+    size = len(addr)
+    flowinfo = addr[2] if size >= 3 else 0  # type: ignore[misc]
+    scopeid = addr[3] if size >= 4 else 0  # type: ignore[misc]
+    sockaddr = (host, port, flowinfo, scopeid)
+    return [(socket.AF_INET6, socket.SOCK_STREAM, socket.IPPROTO_TCP, "", sockaddr)]
+def pop_addr_infos_interleave(
+    addr_infos: List[AddrInfoType], interleave: Optional[int] = None
+) -> None:
+    """
+    Drop the leading addr_infos of every family from *addr_infos* in place.
+    For each address family, the first *interleave* entries (1 when
+    *interleave* is ``None``) are removed; later entries of that family
+    stay in the list in their original order.
+    """
+    per_family = 1 if interleave is None else interleave
+    # Number of entries met so far for each family.
+    met: Dict[int, int] = {}
+    survivors: List[AddrInfoType] = []
+    for info in addr_infos:
+        family = info[0]
+        already = met.get(family, 0)
+        met[family] = already + 1
+        if already >= per_family:
+            survivors.append(info)
+    # Mutate the caller's list rather than rebinding the name.
+    addr_infos[:] = survivors
+def _addr_tuple_to_ip_address(
+    addr: Union[Tuple[str, int], Tuple[str, int, int, int]],
+) -> Union[
+    Tuple[ipaddress.IPv4Address, int], Tuple[ipaddress.IPv6Address, int, int, int]
+]:
+    """
+    Return *addr* with its host string parsed into an ip address object.
+    The first element becomes an ``IPv4Address`` or ``IPv6Address``
+    (whichever ``ipaddress.ip_address`` yields); the remaining elements are
+    carried over unchanged.
+    """
+    parsed_host = ipaddress.ip_address(addr[0])
+    remainder = addr[1:]
+    return (parsed_host, *remainder)
+def remove_addr_infos(
+    addr_infos: List[AddrInfoType],
+    addr: Union[Tuple[str, int], Tuple[str, int, int, int]],
+) -> None:
+    """
+    Delete every entry for *addr* from *addr_infos* in place.
+    *addr* is typically what ``sock.getpeername()`` returned. Entries are
+    first matched by plain equality of their socket address; only when that
+    finds nothing are both sides parsed as ip addresses and compared again,
+    which covers differently formatted spellings of the same address.
+    Raises ``ValueError`` when no entry matches.
+    """
+    matches: List[AddrInfoType] = [
+        candidate for candidate in addr_infos if candidate[-1] == addr
+    ]
+    if not matches:
+        # Slow path in case addr is formatted differently
+        wanted = _addr_tuple_to_ip_address(addr)
+        matches = [
+            candidate
+            for candidate in addr_infos
+            if wanted == _addr_tuple_to_ip_address(candidate[-1])
+        ]
+    if not matches:
+        raise ValueError(f"Address {addr} not found in addr_infos")
+    for stale in matches:
+        addr_infos.remove(stale)

.venv/lib/python3.11/site-packages/attrs/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (1.4 kB). View file

.venv/lib/python3.11/site-packages/attrs/__pycache__/validators.cpython-311.pyc ADDED Viewed

Binary file (222 Bytes). View file

.venv/lib/python3.11/site-packages/vllm/__init__.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# SPDX-License-Identifier: Apache-2.0
+"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+import os
+import torch
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.llm_engine import LLMEngine
+from vllm.entrypoints.llm import LLM
+from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.inputs import PromptType, TextPrompt, TokensPrompt
+from vllm.model_executor.models import ModelRegistry
+from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
+                          CompletionOutput, EmbeddingOutput,
+                          EmbeddingRequestOutput, PoolingOutput,
+                          PoolingRequestOutput, RequestOutput, ScoringOutput,
+                          ScoringRequestOutput)
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
+from .version import __version__, __version_tuple__
+# Set some common config/environment variables that should be set
+# for all processes created by vllm and all processes
+# that interact with vllm workers.
+# These assignments run as a side effect of `import vllm` and overwrite any
+# value already present in the environment.
+# see https://github.com/NVIDIA/nccl/issues/1234
+os.environ['NCCL_CUMEM_ENABLE'] = '0'
+# see https://github.com/vllm-project/vllm/issues/10480
+os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+# see https://github.com/vllm-project/vllm/issues/10619
+torch._inductor.config.compile_threads = 1
+# Names exported by `from vllm import *`.
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "LLM",
+    "ModelRegistry",
+    "PromptType",
+    "TextPrompt",
+    "TokensPrompt",
+    "SamplingParams",
+    "RequestOutput",
+    "CompletionOutput",
+    "PoolingOutput",
+    "PoolingRequestOutput",
+    "EmbeddingOutput",
+    "EmbeddingRequestOutput",
+    "ClassificationOutput",
+    "ClassificationRequestOutput",
+    "ScoringOutput",
+    "ScoringRequestOutput",
+    "LLMEngine",
+    "EngineArgs",
+    "AsyncLLMEngine",
+    "AsyncEngineArgs",
+    "initialize_ray_cluster",
+    "PoolingParams",
+]

.venv/lib/python3.11/site-packages/vllm/_custom_ops.py ADDED Viewed

	@@ -0,0 +1,1098 @@

+# SPDX-License-Identifier: Apache-2.0
+import contextlib
+import importlib
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+import torch
+import torch.library
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType
+logger = init_logger(__name__)
+# Load the compiled ``vllm._C`` extension except on TPU and HPU platforms.
+# A failed import is only logged, so the module itself still imports.
+if not current_platform.is_tpu() and not current_platform.is_hpu():
+    try:
+        import vllm._C
+    except ImportError as e:
+        logger.warning("Failed to import from vllm._C with %r", e)
+# Becomes True only when the optional ``vllm._moe_C`` extension imports.
+supports_moe_ops = False
+with contextlib.suppress(ImportError):
+    import vllm._moe_C  # noqa: F401
+    supports_moe_ops = True
+if TYPE_CHECKING:
+    # Stub seen by type checkers only: a decorator factory that returns the
+    # decorated function unchanged.
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    # Use ``register_fake`` when this torch provides it, otherwise fall back
+    # to ``impl_abstract`` under the same name.
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
+# page attention ops
+def paged_attention_v1(
+    out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    num_kv_heads: int,
+    scale: float,
+    block_tables: torch.Tensor,
+    seq_lens: torch.Tensor,
+    block_size: int,
+    max_seq_len: int,
+    alibi_slopes: Optional[torch.Tensor],
+    kv_cache_dtype: str,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    tp_rank: int = 0,
+    blocksparse_local_blocks: int = 0,
+    blocksparse_vert_stride: int = 0,
+    blocksparse_block_size: int = 64,
+    blocksparse_head_sliding_step: int = 0,
+) -> None:
+    """Call the compiled ``_C.paged_attention_v1`` op.
+    Every argument is forwarded positionally in declaration order. Nothing
+    is returned; presumably the op writes its result into ``out`` (confirm
+    against the kernel).
+    """
+    torch.ops._C.paged_attention_v1(
+        out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
+        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
+        k_scale, v_scale, tp_rank, blocksparse_local_blocks,
+        blocksparse_vert_stride, blocksparse_block_size,
+        blocksparse_head_sliding_step)
+def paged_attention_v2(
+    out: torch.Tensor,
+    exp_sum: torch.Tensor,
+    max_logits: torch.Tensor,
+    tmp_out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    num_kv_heads: int,
+    scale: float,
+    block_tables: torch.Tensor,
+    seq_lens: torch.Tensor,
+    block_size: int,
+    max_seq_len: int,
+    alibi_slopes: Optional[torch.Tensor],
+    kv_cache_dtype: str,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    tp_rank: int = 0,
+    blocksparse_local_blocks: int = 0,
+    blocksparse_vert_stride: int = 0,
+    blocksparse_block_size: int = 64,
+    blocksparse_head_sliding_step: int = 0,
+) -> None:
+    """Call the compiled ``_C.paged_attention_v2`` op.
+    Every argument is forwarded positionally in declaration order. Nothing
+    is returned; ``exp_sum``, ``max_logits`` and ``tmp_out`` look like
+    caller-provided scratch buffers and ``out`` like the destination
+    (confirm against the kernel).
+    """
+    torch.ops._C.paged_attention_v2(
+        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
+        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
+        alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
+        blocksparse_local_blocks, blocksparse_vert_stride,
+        blocksparse_block_size, blocksparse_head_sliding_step)
+def paged_attention_rocm(
+    out: torch.Tensor,
+    exp_sum: torch.Tensor,
+    max_logits: torch.Tensor,
+    tmp_out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    num_kv_heads: int,
+    scale: float,
+    block_tables: torch.Tensor,
+    seq_lens: torch.Tensor,
+    block_size: int,
+    max_seq_len: int,
+    alibi_slopes: Optional[torch.Tensor],
+    kv_cache_dtype: str,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+) -> None:
+    """Call the compiled ``_rocm_C.paged_attention`` op.
+    Every argument is forwarded positionally in declaration order. Unlike
+    the ``_C`` paged-attention wrappers in this module, it takes no
+    ``tp_rank`` or blocksparse parameters. Nothing is returned; presumably
+    the result is written into ``out`` (confirm against the kernel).
+    """
+    torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
+                                      key_cache, value_cache, num_kv_heads,
+                                      scale, block_tables, seq_lens,
+                                      block_size, max_seq_len, alibi_slopes,
+                                      kv_cache_dtype, k_scale, v_scale)
+# pos encoding ops
+def rotary_embedding(
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    head_size: int,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+) -> None:
+    """Call the compiled ``_C.rotary_embedding`` op.
+    Every argument is forwarded positionally in declaration order. Nothing
+    is returned; presumably ``query`` and ``key`` are modified in place
+    (confirm against the kernel).
+    """
+    torch.ops._C.rotary_embedding(positions, query, key, head_size,
+                                  cos_sin_cache, is_neox)
+def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
+                             key: torch.Tensor, head_size: int,
+                             cos_sin_cache: torch.Tensor, is_neox: bool,
+                             rot_dim: int,
+                             cos_sin_cache_offsets: torch.Tensor) -> None:
+    """Call the compiled ``_C.batched_rotary_embedding`` op.
+    Takes the same leading arguments as ``rotary_embedding`` plus
+    ``rot_dim`` and ``cos_sin_cache_offsets``; all are forwarded
+    positionally in declaration order. Nothing is returned; presumably
+    ``query`` and ``key`` are modified in place (confirm against the kernel).
+    """
+    torch.ops._C.batched_rotary_embedding(positions, query, key, head_size,
+                                          cos_sin_cache, is_neox, rot_dim,
+                                          cos_sin_cache_offsets)
+# layer norm ops
+def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
+             epsilon: float) -> None:
+    """Call the compiled ``_C.rms_norm`` op with the given arguments.
+    Nothing is returned; presumably the result is written into ``out``
+    (confirm against the kernel).
+    """
+    torch.ops._C.rms_norm(out, input, weight, epsilon)
+def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
+                       weight: torch.Tensor, epsilon: float) -> None:
+    """Call the compiled ``_C.fused_add_rms_norm`` op with the given arguments.
+    There is no separate output tensor and nothing is returned, so the op
+    presumably updates ``input`` and/or ``residual`` in place (confirm
+    against the kernel).
+    """
+    torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
+def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
+                           input_tokens: torch.Tensor,
+                           sampled_token_ids: torch.Tensor,
+                           input_positions: torch.Tensor,
+                           seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                           block_tables: torch.Tensor) -> None:
+    """Advance a step on GPU for existing inputs for a multi-step runner
+    All arguments are forwarded positionally, in declaration order, to the
+    compiled ``_C.advance_step_flashattn`` op. The op's return value is
+    passed back to the caller even though the annotation says ``None``.
+    """
+    return torch.ops._C.advance_step_flashattn(num_seqs, num_queries,
+                                               block_size, input_tokens,
+                                               sampled_token_ids,
+                                               input_positions, seq_lens,
+                                               slot_mapping, block_tables)
+def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
+                            input_tokens: torch.Tensor,
+                            sampled_token_ids: torch.Tensor,
+                            input_positions: torch.Tensor,
+                            seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                            block_tables: torch.Tensor,
+                            paged_kv_indices: torch.Tensor,
+                            paged_kv_indptr: torch.Tensor,
+                            paged_kv_last_page_len: torch.Tensor,
+                            block_table_bound: torch.Tensor) -> None:
+    return torch.ops._C.advance_step_flashinfer(
+        num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+        input_positions, seq_lens, slot_mapping, block_tables,
+        paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len,
+        block_table_bound)
+# fused quant layer norm ops
+def rms_norm_dynamic_per_token_quant(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    quant_dtype: torch.dtype,
+    scale_ub: Optional[torch.Tensor] = None,
+    residual: Optional[torch.Tensor] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    output = torch.empty_like(input, dtype=quant_dtype)
+    scales = torch.empty((input.numel() // input.shape[-1], 1),
+                         device=input.device,
+                         dtype=torch.float32)
+    torch.ops._C.rms_norm_dynamic_per_token_quant(output, input, weight,
+                                                  scales, epsilon, scale_ub,
+                                                  residual)
+    return output, scales
+# quantization ops
+# awq
+def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
+                   zeros: torch.Tensor, split_k_iters: int, thx: int,
+                   thy: int) -> torch.Tensor:
+    if envs.VLLM_USE_TRITON_AWQ:
+        from vllm.model_executor.layers.quantization.awq_triton import (
+            awq_dequantize_triton)
+        return awq_dequantize_triton(qweight, scales, zeros)
+    return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters,
+                                       thx, thy)
+def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
+             scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
+    if envs.VLLM_USE_TRITON_AWQ:
+        from vllm.model_executor.layers.quantization.awq_triton import (
+            awq_gemm_triton)
+        return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
+    return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
+# gptq
+def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+              b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
+              b_g_idx: torch.Tensor, use_exllama: bool,
+              bit: int) -> torch.Tensor:
+    return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
+                                  b_g_idx, use_exllama, bit)
+if hasattr(torch.ops._C, "gptq_gemm"):
+    @register_fake("_C::gptq_gemm")
+    def _gptq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                        b_gptq_qzeros: torch.Tensor,
+                        b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor,
+                        use_exllama: bool, bit: int) -> torch.Tensor:
+        return torch.empty((a.size(0), b_q_weight.size(1)),
+                           dtype=a.dtype,
+                           device=a.device)
+def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
+                 bit: int) -> None:
+    torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
+# marlin
+def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
+                size_n: int, size_k: int) -> torch.Tensor:
+    return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
+                                    size_n, size_k)
+# marlin_24
+def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                        b_meta: torch.Tensor, b_scales: torch.Tensor,
+                        workspace: torch.Tensor, b_q_type: ScalarType,
+                        size_m: int, size_n: int, size_k: int) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
+                                            workspace, b_q_type.id, size_m,
+                                            size_n, size_k)
+if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
+    @register_fake("_C::gptq_marlin_24_gemm")
+    def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                                  b_meta: torch.Tensor, b_scales: torch.Tensor,
+                                  workspace: torch.Tensor,
+                                  b_q_type: ScalarType, size_m: torch.SymInt,
+                                  size_n: torch.SymInt,
+                                  size_k: torch.SymInt) -> torch.Tensor:
+        return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
+    @register_fake("_C::gptq_marlin_gemm")
+    def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               b_zeros: torch.Tensor,
+                               g_idx: torch.Tensor,
+                               perm: torch.Tensor,
+                               workspace: torch.Tensor,
+                               b_q_type: ScalarType,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool,
+                               has_zp: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
+        return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
+    @register_fake("_C::marlin_qqq_gemm")
+    def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                              s_tok: torch.Tensor, s_ch: torch.Tensor,
+                              s_group: torch.Tensor, workspace: torch.Tensor,
+                              size_m: torch.SymInt, size_n: torch.SymInt,
+                              size_k: torch.SymInt) -> torch.Tensor:
+        return torch.empty((size_m, size_n),
+                           dtype=torch.float16,
+                           device=a.device)
+    @register_fake("_C::marlin_gemm")
+    def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                          b_scales: torch.Tensor, workspace: torch.Tensor,
+                          size_m: torch.SymInt, size_n: torch.SymInt,
+                          size_k: torch.SymInt) -> torch.Tensor:
+        return torch.empty((size_m, size_n),
+                           dtype=torch.float16,
+                           device=a.device)
+    @register_fake("_C::awq_dequantize")
+    def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor,
+                             zeros: torch.Tensor, split_k_iters: torch.SymInt,
+                             thx: int, thy: int) -> torch.Tensor:
+        in_c = qweight.size(0)
+        qout_c = qweight.size(1)
+        out_c = qout_c * 8
+        return torch.empty((in_c, out_c),
+                           dtype=scales.dtype,
+                           device=scales.device)
+    @register_fake("_C::awq_gemm")
+    def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor,
+                       qzeros: torch.Tensor, scales: torch.Tensor,
+                       split_k_iters: torch.SymInt) -> torch.Tensor:
+        num_in_feats = input.size(0)
+        return torch.empty((split_k_iters, num_in_feats, qweight.size(1) * 8),
+                           dtype=input.dtype,
+                           device=input.device).sum(0)
+    @register_fake("_C::aqlm_gemm")
+    def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor,
+                        codebooks: torch.Tensor, scales: torch.Tensor,
+                        codebook_partition_sizes: List[int],
+                        bias: Optional[torch.Tensor]) -> torch.Tensor:
+        out_features = codes.size(0) * codebooks.size(2)
+        flat_input = input.reshape((-1, input.size(-1)))
+        flat_output = torch.empty((flat_input.size(0), out_features),
+                                  dtype=input.dtype,
+                                  device=input.device)
+        output_sizes = list(input.shape)
+        output_sizes.pop()
+        output_sizes.append(-1)
+        return flat_output.reshape(tuple(output_sizes))
+    @register_fake("_C::aqlm_dequant")
+    def _aqlm_dequant_fake(
+            codes: torch.Tensor, codebooks: torch.Tensor,
+            codebook_partition_sizes: List[int]) -> torch.Tensor:
+        in_features = codes.size(1) * 8
+        out_features = codes.size(0)
+        return torch.empty((out_features, in_features),
+                           dtype=codebooks.dtype,
+                           device=codebooks.device)
+    @register_fake("_C::fp8_marlin_gemm")
+    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                              b_scales: torch.Tensor, workspace: torch.Tensor,
+                              num_bits: int, size_m: torch.SymInt,
+                              size_n: torch.SymInt,
+                              size_k: torch.SymInt) -> torch.Tensor:
+        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
+    @register_fake("_C::machete_mm")
+    def machete_mm_fake(
+        a: torch.Tensor,
+        # b_q Should be the tensor returned by machete_prepack_B
+        b_q: torch.Tensor,
+        b_type: ScalarType,
+        out_type: Optional[torch.dtype] = None,
+        b_group_scales: Optional[torch.Tensor] = None,
+        b_group_zeros: Optional[torch.Tensor] = None,
+        b_group_size: Optional[int] = None,
+        b_channel_scales: Optional[torch.Tensor] = None,
+        a_token_scales: Optional[torch.Tensor] = None,
+        schedule: Optional[str] = None,
+    ) -> torch.Tensor:
+        m = a.size(0)
+        n = b_q.size(1)
+        return torch.empty((m, n), device=a.device, dtype=a.dtype)
+    @register_fake("_C::machete_prepack_B")
+    def machete_prepack_B_fake(
+            b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType,
+            group_scales_type: Optional[torch.dtype]) -> torch.Tensor:
+        return torch.empty_like(b_q_weight,
+                                memory_format=torch.contiguous_format)
+if hasattr(torch.ops._C, "ggml_dequantize"):
+    @register_fake("_C::ggml_dequantize")
+    def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int,
+                              m: torch.SymInt,
+                              n: torch.SymInt) -> torch.Tensor:
+        return torch.empty((m, n), dtype=torch.float16, device=W.device)
+    @register_fake("_C::ggml_mul_mat_vec_a8")
+    def _ggml_mul_mat_vec_a8_fake(
+        W: torch.Tensor,
+        X: torch.Tensor,
+        quant_type: int,
+        row: torch.SymInt,
+    ) -> torch.Tensor:
+        return torch.empty((1, row), dtype=torch.float16, device=W.device)
+    @register_fake("_C::ggml_mul_mat_a8")
+    def _ggml_mul_mat_a8_fake(
+        W: torch.Tensor,
+        X: torch.Tensor,
+        quant_type: int,
+        row: torch.SymInt,
+    ) -> torch.Tensor:
+        batch = X.size(0)
+        return torch.empty((batch, row), dtype=torch.float16, device=W.device)
+# cutlass
+def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(
+        cuda_device_capability)
+def cutlass_scaled_mm(a: torch.Tensor,
+                      b: torch.Tensor,
+                      scale_a: torch.Tensor,
+                      scale_b: torch.Tensor,
+                      out_dtype: torch.dtype,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    `cutlass_scaled_mm` implements a fused version of
+        `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
+    where scale_a * a and scale_b * b are implemented using numpy-style
+    broadcasting.
+    In order to support blockwise scaling like found in DeepSeek V3 we also
+    support extended "group" broadcast rules. We extend the numpy-style
+    broadcasting rules with the following rule:
+        "if the extent of a dimension in the source shape is between 1 and
+        corresponding extent in the target shape we repeat each element along
+        that dimension  src_shape[dim] // target_shape[dim] times consecutively"
+    example if we have:
+          a = [[1, 2], and target_shape = (2, 4)
+               [3, 4]]
+    then we would expand a to:
+          a = [[1, 1, 2, 2],
+               [3, 3, 4, 4]]
+    currently we only support the case:
+        scale_a.shape * [1, 128] == a.shape
+        scale_b.shape * [128, 128] == b.shape
+    """
+    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+    assert bias is None or bias.shape[0] == b.shape[
+        1] and bias.dtype == out_dtype
+    m = a.shape[0]
+    n = b.shape[1]
+    if current_platform.is_rocm():
+        triton_scaled_mm_module = importlib.import_module(
+            "vllm.model_executor.layers.quantization.compressed_tensors."
+            "triton_scaled_mm")
+        triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+    return out
+def cutlass_scaled_mm_azp(a: torch.Tensor,
+                          b: torch.Tensor,
+                          scale_a: torch.Tensor,
+                          scale_b: torch.Tensor,
+                          out_dtype: torch.dtype,
+                          azp_adj: torch.Tensor,
+                          azp: Optional[torch.Tensor] = None,
+                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    :param azp_adj: In the per-tensor case, this should include the azp.
+    Always per-channel.
+    :param azp: Only set in the per-token case. Per-token if set.
+    """
+    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+    assert bias is None or bias.numel(
+    ) == b.shape[1] and bias.dtype == out_dtype
+    assert azp is None or azp.numel() == a.shape[0]
+    m = a.shape[0]
+    n = b.shape[1]
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+    torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj,
+                                       azp, bias)
+    return out
+def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_sparse_scaled_mm_supported(
+        cuda_device_capability)
+def cutlass_sparse_compress(a: torch.Tensor) \
+    -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compresses a sparse matrix for use with Cutlass sparse operations.
+    This function takes a dense tensor and compresses it into two components:
+    non-zero elements and metadata. The compressed representation is compatible
+    with Cutlass sparse kernels.
+    Args:
+        a (torch.Tensor):
+            The input tensor to be compressed. Must have one of the following data types:
+            - `torch.int8`
+            - `torch.float8_e4m3fn`
+            - `torch.bfloat16`
+            - `torch.float16`
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]:
+            A tuple containing:
+            - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
+            - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
+    Raises:
+        ValueError: If the compression operation fails.
+    Notes:
+        - The `a_meta` tensor has a data type of `torch.uint8`.
+        - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
+        - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
+        - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
+    """
+    assert (a.dtype in [
+        torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16
+    ])
+    assert (a.is_contiguous())
+    # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
+    elemsPerMetaElem = 4
+    m = a.shape[0]
+    k = a.shape[1]
+    assert (k % 2 == 0)
+    a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
+    a_meta = torch.empty((m, k // 2 // elemsPerMetaElem),
+                         dtype=torch.uint8,
+                         device=a.device)
+    if not (torch.ops._C.cutlass_sparse_compress_entry(a_nzs, a_meta, a)):
+        raise ValueError
+    assert (a_nzs.is_contiguous())
+    assert (a_meta.is_contiguous())
+    return a_nzs, a_meta
+def cutlass_scaled_sparse_mm(
+        a: torch.Tensor,
+        bt_nzs: torch.Tensor,
+        bt_meta: torch.Tensor,
+        scale_a: torch.Tensor,
+        scale_b: torch.Tensor,
+        out_dtype: torch.dtype,
+        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    Performs a scaled sparse matrix multiplication using Cutlass.
+    Steps:
+    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
+    `a = torch.randn((m, k), device='cuda')`.
+    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
+    `b = torch.randn((k, n), device='cuda')`.
+    3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
+    `b = prune_to_2_4(b, dim=0)`.
+    4. Compress the transposed sparse matrix `b.t()`:
+    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
+    5. Perform sparse matrix multiplication using the compressed matrix,
+    applying scaling factors for `a` and `b`, and the output data type:
+    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
+    Returns:
+    - The result of the scaled sparse matrix multiplication.
+    """
+    assert (bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0)
+    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+    assert bias is None or bias.shape[0] == bt_nzs.shape[0] \
+        and bias.dtype == out_dtype
+    m = a.shape[0]
+    n = bt_nzs.shape[0]
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+    torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a,
+                                          scale_b, bias)
+    return out
+# aqlm
+def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
+              codebooks: torch.Tensor, scales: torch.Tensor,
+              codebook_partition_sizes: List[int],
+              bias: Optional[torch.Tensor]) -> torch.Tensor:
+    return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales,
+                                  codebook_partition_sizes, bias)
+def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
+                 codebook_partition_sizes: List[int]) -> torch.Tensor:
+    return torch.ops._C.aqlm_dequant(codes, codebooks,
+                                     codebook_partition_sizes)
+# gptq_marlin
+def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
+                       size_k: int, size_n: int,
+                       num_bits: int) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
+                                           num_bits)
+# awq_marlin
+def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int,
+                      num_bits: int) -> torch.Tensor:
+    return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
+def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
+                           size_k: int, size_n: int,
+                           num_bits: int) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
+                         device=b_q_weight.device,
+                         dtype=b_q_weight.dtype)
+    for e in range(num_experts):
+        output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e],
+                                                    size_k, size_n, num_bits)
+    return output
+def awq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
+                          size_k: int, size_n: int,
+                          num_bits: int) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
+                         device=b_q_weight.device,
+                         dtype=b_q_weight.dtype)
+    for e in range(num_experts):
+        output[e] = torch.ops._C.awq_marlin_repack(b_q_weight[e], size_k,
+                                                   size_n, num_bits)
+    return output
+def gptq_marlin_gemm(a: torch.Tensor,
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     b_zeros: torch.Tensor,
+                     g_idx: torch.Tensor,
+                     perm: torch.Tensor,
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool,
+                     has_zp: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
+                                         g_idx, perm, workspace, b_q_type.id,
+                                         size_m, size_n, size_k, is_k_full,
+                                         has_zp, use_fp32_reduce, is_zp_float)
+# fp8 marlin
+def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                    b_scales: torch.Tensor, workspace: torch.Tensor,
+                    num_bits: int, size_m: int, size_n: int,
+                    size_k: int) -> torch.Tensor:
+    return torch.ops._C.fp8_marlin_gemm(a, b_q_weight, b_scales, workspace,
+                                        num_bits, size_m, size_n, size_k)
+# machete
+def machete_supported_schedules(
+        a_type: torch.dtype,
+        b_type: ScalarType,
+        group_scales_type: Optional[torch.dtype],
+        group_zeros_type: Optional[torch.dtype] = None,
+        channel_scales_type: Optional[torch.dtype] = None,
+        token_scales_type: Optional[torch.dtype] = None,
+        out_type: Optional[torch.dtype] = None) -> List[str]:
+    return torch.ops._C.machete_supported_schedules(
+        a_type, b_type.id, group_scales_type, group_zeros_type,
+        channel_scales_type, token_scales_type, out_type)
+def machete_mm(
+        a: torch.Tensor,
+        # b_q Should be the tensor returned by machete_prepack_B
+        b_q: torch.Tensor,
+        b_type: ScalarType,
+        out_type: Optional[torch.dtype] = None,
+        b_group_scales: Optional[torch.Tensor] = None,
+        b_group_zeros: Optional[torch.Tensor] = None,
+        b_group_size: Optional[int] = None,
+        b_channel_scales: Optional[torch.Tensor] = None,
+        a_token_scales: Optional[torch.Tensor] = None,
+        schedule: Optional[str] = None) -> torch.Tensor:
+    return torch.ops._C.machete_mm(a, b_q, b_type.id, out_type, b_group_scales,
+                                   b_group_zeros, b_group_size,
+                                   b_channel_scales, a_token_scales, schedule)
+def machete_prepack_B(
+        b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType,
+        group_scales_type: Optional[torch.dtype]) -> torch.Tensor:
+    return torch.ops._C.machete_prepack_B(b_q_weight, a_type, b_type.id,
+                                          group_scales_type)
+if hasattr(torch.ops._C, "permute_cols"):
+    @register_fake("_C::permute_cols")
+    def _permute_cols_fake(a: torch.Tensor,
+                           perm: torch.Tensor) -> torch.Tensor:
+        return torch.empty_like(a)
+def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
+    return torch.ops._C.permute_cols(a, perm)
+# fp8
+def scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+    num_token_padding: Optional[int] = None,
+    scale_ub: Optional[torch.Tensor] = None,
+    use_per_token_if_dynamic: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP8 and return quantized tensor and scale.
+    This function supports both static and dynamic quantization: If you
+    provide the scale, it will use static scaling and if you omit it,
+    the scale will be determined dynamically. The function also allows
+    optional padding of the output tensors for downstream kernels that
+    will benefit from padding.
+    Args:
+        input: The input tensor to be quantized to FP8
+        scale: Optional scaling factor for the FP8 quantization
+        scale_ub: Optional upper bound for scaling factor in dynamic
+            per token case
+        num_token_padding: If specified, pad the first dimension
+            of the output to at least this value.
+        use_per_token_if_dynamic: Whether to do per_tensor or per_token
+            in the dynamic quantization case.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
+            scaling factor.
+    """
+    # This code assumes batch_dim and num_tokens are flattened
+    assert (input.ndim == 2)
+    shape: Union[Tuple[int, int], torch.Size] = input.shape
+    # For rocm, the output fp8 dtype is torch.float_e3m3fnuz
+    out_dtype: torch.dtype = torch.float8_e4m3fnuz \
+            if current_platform.is_rocm() else torch.float8_e4m3fn
+    if num_token_padding:
+        shape = (max(num_token_padding, input.shape[0]), shape[1])
+    output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    if scale is None:
+        if use_per_token_if_dynamic:
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                output, input, scale, scale_ub)
+        else:
+            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+    else:
+        # num_token_padding not implemented for this case
+        assert (scale.numel() == 1 or num_token_padding is None)
+        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+    return output, scale
+# int8
+def scaled_int8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+    azp: Optional[torch.Tensor] = None,
+    symmetric: bool = True
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """
+    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
+    Args:
+        input: The input tensor to be quantized to int8.
+        scale: Optional scaling factor for the int8 quantization.
+            When not provided, we invoke dynamic-per-token quantization.
+        azp: Optional zero-point for the int8 quantization.
+            Must be provided for asymmetric quantization if `scale` is provided.
+        symmetric: Whether to use symmetric quantization (scale only, azp ignored).
+    Returns:
+      Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
+    """
+    output = torch.empty_like(input, dtype=torch.int8)
+    if scale is not None:
+        # static-per-tensor quantization.
+        assert symmetric == (
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
+        torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
+        return output, scale, azp
+    # dynamic-per-token quantization.
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales,
+                                           input_azp)
+    return output, input_scales, input_azp
+# qqq ops
+def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                    s_tok: torch.Tensor, s_ch: torch.Tensor,
+                    s_group: torch.Tensor, workspace: torch.Tensor,
+                    size_m: int, size_n: int, size_k: int) -> torch.Tensor:
+    return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group,
+                                        workspace, size_m, size_n, size_k)
+# gguf
+def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int,
+                    n: int) -> torch.Tensor:
+    return torch.ops._C.ggml_dequantize(W, quant_type, m, n)
+def ggml_mul_mat_vec_a8(
+    W: torch.Tensor,
+    X: torch.Tensor,
+    quant_type: int,
+    row: int,
+) -> torch.Tensor:
+    return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row)
+def ggml_mul_mat_a8(
+    W: torch.Tensor,
+    X: torch.Tensor,
+    quant_type: int,
+    row: int,
+) -> torch.Tensor:
+    return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row)
+# mamba
+def causal_conv1d_fwd(x: torch.Tensor, weight: torch.Tensor,
+                      bias_: Optional[torch.Tensor],
+                      conv_states: Optional[torch.Tensor],
+                      query_start_loc: Optional[torch.Tensor],
+                      cache_indices: Optional[torch.Tensor],
+                      has_initial_state: Optional[torch.Tensor],
+                      silu_activation: bool, pad_slot_id: int):
+    torch.ops._C.causal_conv1d_fwd(x, weight, bias_, conv_states,
+                                   query_start_loc, cache_indices,
+                                   has_initial_state, silu_activation,
+                                   pad_slot_id)
+def causal_conv1d_update(x: torch.Tensor, conv_state: torch.Tensor,
+                         weight: torch.Tensor, bias_: Optional[torch.Tensor],
+                         silu_activation: bool,
+                         cache_seqlens: Optional[torch.Tensor],
+                         conv_state_indices: Optional[torch.Tensor],
+                         pad_slot_id: int):
+    torch.ops._C.causal_conv1d_update(x, conv_state, weight, bias_,
+                                      silu_activation, cache_seqlens,
+                                      conv_state_indices, pad_slot_id)
+def selective_scan_fwd(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
+                       B: torch.Tensor, C: torch.Tensor,
+                       D_: Optional[torch.Tensor], z_: Optional[torch.Tensor],
+                       delta_bias_: Optional[torch.Tensor],
+                       delta_softplus: bool,
+                       query_start_loc: Optional[torch.Tensor],
+                       cache_indices: Optional[torch.Tensor],
+                       has_initial_state: Optional[torch.Tensor],
+                       ssm_states: torch.Tensor, pad_slot_id: int):
+    torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_, delta_bias_,
+                                    delta_softplus, query_start_loc,
+                                    cache_indices, has_initial_state,
+                                    ssm_states, pad_slot_id)
+# moe
+def moe_sum(input: torch.Tensor, output: torch.Tensor):
+    torch.ops._moe_C.moe_sum(input, output)
+def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
+                         block_size: int, sorted_token_ids: torch.Tensor,
+                         experts_ids: torch.Tensor,
+                         num_tokens_post_pad: torch.Tensor) -> None:
+    torch.ops._moe_C.moe_align_block_size(topk_ids, num_experts, block_size,
+                                          sorted_token_ids, experts_ids,
+                                          num_tokens_post_pad)
+def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
+                             block_size: int, sorted_token_ids: torch.Tensor,
+                             experts_ids: torch.Tensor,
+                             num_tokens_post_pad: torch.Tensor) -> None:
+    torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts,
+                                              block_size, sorted_token_ids,
+                                              experts_ids, num_tokens_post_pad)
+def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 token_expert_indicies: torch.Tensor,
+                 gating_output: float) -> None:
+    torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
+                                  token_expert_indicies, gating_output)
+# Register the fake implementation only when MoE ops are supported and the
+# loaded `_moe_C` extension actually exposes `marlin_gemm_moe`.
+if supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):
+    @register_fake("_moe_C::marlin_gemm_moe")
+    def marlin_gemm_moe_fake(a: torch.Tensor, b_q_weights: torch.Tensor,
+                             sorted_ids: torch.Tensor,
+                             topk_weights: torch.Tensor,
+                             topk_ids: torch.Tensor, b_scales: torch.Tensor,
+                             b_zero_points: torch.Tensor, g_idx: torch.Tensor,
+                             perm: torch.Tensor, workspace: torch.Tensor,
+                             b_q_type: ScalarType, size_m: torch.SymInt,
+                             size_n: torch.SymInt, size_k: torch.SymInt,
+                             is_k_full: bool, num_experts: int, topk: int,
+                             moe_block_size: int, replicate_input: bool,
+                             apply_weights: bool) -> torch.Tensor:
+        """Fake implementation registered for ``_moe_C::marlin_gemm_moe``.
+        Returns an uninitialized tensor of shape ``(size_m, topk, size_n)``
+        with the dtype and device of ``a``. No other argument is read.
+        """
+        return torch.empty((size_m, topk, size_n),
+                           dtype=a.dtype,
+                           device=a.device)
+def reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+) -> None:
+    torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
+                                             value_cache, slot_mapping,
+                                             kv_cache_dtype, k_scale, v_scale)
+def reshape_and_cache_flash(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+) -> None:
+    torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
+                                                   value_cache, slot_mapping,
+                                                   kv_cache_dtype, k_scale,
+                                                   v_scale)
+def concat_and_cache_mla(
+    kv_c: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    scale: torch.Tensor,
+) -> None:
+    torch.ops._C_cache_ops.concat_and_cache_mla(kv_c, k_pe, kv_cache,
+                                                slot_mapping, kv_cache_dtype,
+                                                scale)
+def copy_blocks(key_caches: List[torch.Tensor],
+                value_caches: List[torch.Tensor],
+                block_mapping: torch.Tensor) -> None:
+    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
+def copy_blocks_mla(kv_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+    torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
+def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
+                block_mapping: torch.Tensor) -> None:
+    torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
+def convert_fp8(output: torch.Tensor,
+                input: torch.Tensor,
+                scale: float = 1.0,
+                kv_dtype: str = "fp8") -> None:
+    torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype)
+def get_device_attribute(attribute: int, device: int) -> int:
+    return torch.ops._C_cuda_utils.get_device_attribute(attribute, device)
+def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
+    # ruff: noqa: E501
+    return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute(
+        device)
+# custom ar
+def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor,
+                   rank: int, full_nvlink: bool) -> int:
+    return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank,
+                                                 full_nvlink)
+def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int,
+               reg_buffer_sz_bytes: int) -> None:
+    torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer,
+                                      reg_buffer_sz_bytes)
+def dispose(fa: int) -> None:
+    torch.ops._C_custom_ar.dispose(fa)
+def meta_size() -> int:
+    return torch.ops._C_custom_ar.meta_size()
+def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+    return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors)
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+    return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
+def register_graph_buffers(fa: int, handles: List[List[int]],
+                           offsets: List[List[int]]) -> None:
+    torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)

.venv/lib/python3.11/site-packages/vllm/_ipex_ops.py ADDED Viewed

	@@ -0,0 +1,228 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Optional, Tuple
+import torch
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+# The IPEX import is best-effort: on ImportError only a warning is logged and
+# the name `ipex` is left undefined, so any function below that references
+# `ipex` will raise NameError when it is called.
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError as e:
+    logger.warning("Import error msg: %s", e.msg)
+class ipex_ops:
+    @staticmethod
+    def _reshape_activation_tensor(
+            x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        num = x.size(0)
+        d = x.size(1) // 2
+        x = x.reshape(num, 2, d)
+        x1, x2 = torch.chunk(x, chunks=2, dim=1)
+        x1 = x1.reshape(num, d)
+        x2 = x2.reshape(num, d)
+        return x1, x2
+    @staticmethod
+    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.silu_and_mul(x, out)
+    @staticmethod
+    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_and_mul(x, out)
+    @staticmethod
+    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_and_mul(x, out)
+    @staticmethod
+    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
+    @staticmethod
+    def gelu_new(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
+    @staticmethod
+    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_quick(x, out)
+    @staticmethod
+    def paged_attention_v1(
+        out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
+            out,
+            query.contiguous(),
+            key_cache.view_as(value_cache),
+            value_cache,
+            num_queries_per_tokens,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+    @staticmethod
+    def paged_attention_v2(
+        out: torch.Tensor,
+        exp_sum: torch.Tensor,
+        max_logits: torch.Tensor,
+        tmp_out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
+            out,
+            query.contiguous(),
+            key_cache.view_as(value_cache),
+            value_cache,
+            num_queries_per_tokens,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+    @staticmethod
+    def rotary_embedding(
+        positions: torch.Tensor,  # [batch_size, seq_len]
+        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
+        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
+        head_size: int,
+        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
+        is_neox: bool,
+    ) -> None:
+        rot_dim = cos_sin_cache.size(1)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim)
+    @staticmethod
+    def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
+                                 key: torch.Tensor, head_size: int,
+                                 cos_sin_cache: torch.Tensor, is_neox: bool,
+                                 rot_dim: int,
+                                 cos_sin_cache_offsets: torch.Tensor) -> None:
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim,
+                                                     cos_sin_cache_offsets)
+    @staticmethod
+    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> torch.Tensor:
+        return ipex.llm.functional.rms_norm(input, weight, epsilon)
+    @staticmethod
+    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
+                           weight: torch.Tensor, epsilon: float) -> None:
+        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
+                                               epsilon, True)
+        input.copy_(tmp)
+    @staticmethod
+    def varlen_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        out: torch.Tensor,
+        seqlen_q: torch.Tensor,
+        seqlen_k: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        pdropout: float,
+        softmax_scale: float,
+        zero_tensors: bool,
+        is_causal: bool,
+        return_softmax: bool,
+        gen_: torch.Generator,
+        logits_soft_cap: float,
+    ) -> None:
+        ipex.llm.functional.varlen_attention(query.contiguous(),
+                                             key.contiguous(),
+                                             value.contiguous(), out,
+                                             seqlen_q.int(), seqlen_k.int(),
+                                             max_seqlen_q, max_seqlen_k,
+                                             pdropout, softmax_scale,
+                                             zero_tensors, is_causal,
+                                             return_softmax, gen_,
+                                             logits_soft_cap)
+    @staticmethod
+    def reshape_and_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        ipex.llm.modules.PagedAttention.reshape_and_cache(
+            key, value, key_cache, value_cache, slot_mapping)
+    @staticmethod
+    def copy_blocks(key_caches: List[torch.Tensor],
+                    value_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.copy_blocks(  # type: ignore
+            key_caches,
+            value_caches,
+            block_mapping,
+        )
+    @staticmethod
+    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore

.venv/lib/python3.11/site-packages/vllm/_version.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# file generated by setuptools_scm
+# don't change, don't track in version control
+# TYPE_CHECKING is hard-coded to False here, so the `typing` import below is
+# never executed at runtime and VERSION_TUPLE is plain `object`.
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+# The version is exposed under both dunder and plain names.
+__version__ = version = '0.7.2'
+__version_tuple__ = version_tuple = (0, 7, 2)

.venv/lib/python3.11/site-packages/vllm/beam_search.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from vllm.sequence import Logprob
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalDataDict
+@dataclass
+class BeamSearchSequence:
+    """A sequence for beam search.
+    It keeps track of the tokens and the log probability of the sequence.
+    The text field is optional and will only be filled when the sequence is
+    about to be returned to the user.
+    """
+    # The tokens include the prompt.
+    tokens: List[int]
+    # One dict per entry, mapping a token id to its Logprob.
+    logprobs: List[Dict[int, Logprob]]
+    # Cumulative log probability of the sequence; starts at 0.0.
+    cum_logprob: float = 0.0
+    # All remaining fields are optional and default to None.
+    text: Optional[str] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Union[int, str, None] = None
+    multi_modal_data: Optional["MultiModalDataDict"] = None
+    mm_processor_kwargs: Optional[Dict[str, Any]] = None
+@dataclass
+class BeamSearchOutput:
+    """The output of beam search.
+    It contains the list of the best beam search sequences.
+    The length of the list is equal to the beam width.
+    """
+    # The returned sequences; nothing in this class enforces their count
+    # or ordering.
+    sequences: List[BeamSearchSequence]
+class BeamSearchInstance:
+    def __init__(self, prompt_tokens: List[int]):
+        self.beams: List[BeamSearchSequence] = [
+            BeamSearchSequence(tokens=prompt_tokens, logprobs=[])
+        ]
+        self.completed: List[BeamSearchSequence] = []
+def get_beam_search_score(
+    tokens: List[int],
+    cumulative_logprob: float,
+    eos_token_id: int,
+    length_penalty: float = 1.0,
+) -> float:
+    """Calculate the beam search score with length penalty.
+    Adapted from
+    https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
+    """
+    seq_len = len(tokens)
+    if tokens[-1] == eos_token_id:
+        seq_len -= 1
+    return cumulative_logprob / (seq_len**length_penalty)
+def create_sort_beams_key_function(eos_token_id: int, length_penalty: float):
+    def sort_beams_key(x: BeamSearchSequence) -> float:
+        return get_beam_search_score(x.tokens, x.cum_logprob, eos_token_id,
+                                     length_penalty)
+    return sort_beams_key

.venv/lib/python3.11/site-packages/vllm/config.py ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.11/site-packages/vllm/connections.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+from typing import Any, Mapping, MutableMapping, Optional
+from urllib.parse import urlparse
+import aiohttp
+import requests
+from vllm.version import __version__ as VLLM_VERSION
+class HTTPConnection:
+    """Helper class to send HTTP requests."""
+    def __init__(self, *, reuse_client: bool = True) -> None:
+        super().__init__()
+        self.reuse_client = reuse_client
+        self._sync_client: Optional[requests.Session] = None
+        self._async_client: Optional[aiohttp.ClientSession] = None
+    def get_sync_client(self) -> requests.Session:
+        if self._sync_client is None or not self.reuse_client:
+            self._sync_client = requests.Session()
+        return self._sync_client
+    # NOTE: We intentionally use an async function even though it is not
+    # required, so that the client is only accessible inside async event loop
+    async def get_async_client(self) -> aiohttp.ClientSession:
+        if self._async_client is None or not self.reuse_client:
+            self._async_client = aiohttp.ClientSession(trust_env=True)
+        return self._async_client
+    def _validate_http_url(self, url: str):
+        parsed_url = urlparse(url)
+        if parsed_url.scheme not in ("http", "https"):
+            raise ValueError("Invalid HTTP URL: A valid HTTP URL "
+                             "must have scheme 'http' or 'https'.")
+    def _headers(self, **extras: str) -> MutableMapping[str, str]:
+        return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras}
+    def get_response(
+        self,
+        url: str,
+        *,
+        stream: bool = False,
+        timeout: Optional[float] = None,
+        extra_headers: Optional[Mapping[str, str]] = None,
+    ):
+        self._validate_http_url(url)
+        client = self.get_sync_client()
+        extra_headers = extra_headers or {}
+        return client.get(url,
+                          headers=self._headers(**extra_headers),
+                          stream=stream,
+                          timeout=timeout)
+    async def get_async_response(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+        extra_headers: Optional[Mapping[str, str]] = None,
+    ):
+        self._validate_http_url(url)
+        client = await self.get_async_client()
+        extra_headers = extra_headers or {}
+        return client.get(url,
+                          headers=self._headers(**extra_headers),
+                          timeout=timeout)
+    def get_bytes(self, url: str, *, timeout: Optional[float] = None) -> bytes:
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return r.content
+    async def async_get_bytes(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+    ) -> bytes:
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return await r.read()
+    def get_text(self, url: str, *, timeout: Optional[float] = None) -> str:
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return r.text
+    async def async_get_text(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+    ) -> str:
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return await r.text()
+    def get_json(self, url: str, *, timeout: Optional[float] = None) -> str:
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return r.json()
+    async def async_get_json(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+    ) -> str:
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return await r.json()
+    def download_file(
+        self,
+        url: str,
+        save_path: Path,
+        *,
+        timeout: Optional[float] = None,
+        chunk_size: int = 128,
+    ) -> Path:
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            with save_path.open("wb") as f:
+                for chunk in r.iter_content(chunk_size):
+                    f.write(chunk)
+        return save_path
+    async def async_download_file(
+        self,
+        url: str,
+        save_path: Path,
+        *,
+        timeout: Optional[float] = None,
+        chunk_size: int = 128,
+    ) -> Path:
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            with save_path.open("wb") as f:
+                async for chunk in r.content.iter_chunked(chunk_size):
+                    f.write(chunk)
+        return save_path
+# Module-level instance built with the default arguments.
+global_http_connection = HTTPConnection()
+"""The global :class:`HTTPConnection` instance used by vLLM."""

.venv/lib/python3.11/site-packages/vllm/cumem_allocator.abi3.so ADDED Viewed

Binary file (27.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/envs.py ADDED Viewed

	@@ -0,0 +1,588 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+import tempfile
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+if TYPE_CHECKING:
+    VLLM_HOST_IP: str = ""
+    VLLM_PORT: Optional[int] = None
+    VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
+    VLLM_USE_MODELSCOPE: bool = False
+    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
+    VLLM_NCCL_SO_PATH: Optional[str] = None
+    LD_LIBRARY_PATH: Optional[str] = None
+    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
+    LOCAL_RANK: int = 0
+    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+    VLLM_API_KEY: Optional[str] = None
+    S3_ACCESS_KEY_ID: Optional[str] = None
+    S3_SECRET_ACCESS_KEY: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
+    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
+    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
+    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
+    VLLM_NO_USAGE_STATS: bool = False
+    VLLM_DO_NOT_TRACK: bool = False
+    VLLM_USAGE_SOURCE: str = ""
+    VLLM_CONFIGURE_LOGGING: int = 1
+    VLLM_LOGGING_LEVEL: str = "INFO"
+    VLLM_LOGGING_PREFIX: str = ""
+    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
+    VLLM_TRACE_FUNCTION: int = 0
+    VLLM_ATTENTION_BACKEND: Optional[str] = None
+    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
+    VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
+    VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
+    VLLM_PP_LAYER_PARTITION: Optional[str] = None
+    VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_CPU_OMP_THREADS_BIND: str = ""
+    VLLM_OPENVINO_DEVICE: str = "CPU"
+    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
+    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
+    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    VLLM_USE_RAY_SPMD_WORKER: bool = False
+    VLLM_USE_RAY_COMPILED_DAG: bool = False
+    VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
+    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
+    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
+    VLLM_VIDEO_FETCH_TIMEOUT: int = 30
+    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    VLLM_USE_PRECOMPILED: bool = False
+    VLLM_NO_DEPRECATION_WARNING: bool = False
+    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
+    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
+    VLLM_TEST_FORCE_FP8_MARLIN: bool = False
+    VLLM_RPC_TIMEOUT: int = 10000  # ms
+    VLLM_PLUGINS: Optional[List[str]] = None
+    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
+    VLLM_USE_TRITON_AWQ: bool = False
+    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
+    VLLM_SKIP_P2P_CHECK: bool = False
+    VLLM_DISABLED_KERNELS: List[str] = []
+    VLLM_USE_V1: bool = False
+    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
+    VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
+    VLLM_DISABLE_COMPILE_CACHE: bool = False
+    K_SCALE_CONSTANT: int = 200
+    V_SCALE_CONSTANT: int = 100
+    VLLM_SERVER_DEV_MODE: bool = False
+    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
+    VLLM_MLA_DISABLE: bool = False
+    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
+    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
+    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
+    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
+    VLLM_RAY_BUNDLE_INDICES: str = ""
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+def maybe_convert_int(value: Optional[str]) -> Optional[int]:
+    if value is None:
+        return None
+    return int(value)
+# The begin-* and end* here are used by the documentation generator
+# to extract the used env vars.
+# begin-env-vars-definition
+environment_variables: Dict[str, Callable[[], Any]] = {
+    # ================== Installation Time Env Vars ==================
+    # Target device of vLLM, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
+    "VLLM_TARGET_DEVICE":
+    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+    # If set, vllm will use precompiled binaries (*.so)
+    "VLLM_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    # If set, vllm will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+    # Root directory for VLLM configuration files
+    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "vllm"),
+        )),
+    # ================== Runtime Env Vars ==================
+    # Root directory for VLLM cache files
+    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
+    "VLLM_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "vllm"),
+        )),
+    # used in distributed environment to determine the ip address
+    # of the current node, when the node has multiple network interfaces.
+    # If you are using multi-node inference, you should set this differently
+    # on each node.
+    'VLLM_HOST_IP':
+    lambda: os.getenv('VLLM_HOST_IP', ""),
+    # used in distributed environment to manually set the communication port
+    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
+    # VLLM_PORT will be used as the first port, and the rest will be generated
+    # by incrementing the VLLM_PORT value.
+    # '0' is used to make mypy happy
+    'VLLM_PORT':
+    lambda: int(os.getenv('VLLM_PORT', '0'))
+    if 'VLLM_PORT' in os.environ else None,
+    # path used for ipc when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'VLLM_RPC_BASE_PATH':
+    lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),
+    # If true, will load models from ModelScope instead of Hugging Face Hub.
+    # note that the value is true or false, not numbers
+    "VLLM_USE_MODELSCOPE":
+    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "VLLM_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
+    # path to cudatoolkit home directory, under which should be bin, include,
+    # and lib directories.
+    "CUDA_HOME":
+    lambda: os.environ.get("CUDA_HOME", None),
+    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+    "VLLM_NCCL_SO_PATH":
+    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
+    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
+    # library file in the locations specified by `LD_LIBRARY_PATH`
+    "LD_LIBRARY_PATH":
+    lambda: os.environ.get("LD_LIBRARY_PATH", None),
+    # flag to control if vllm should use triton flash attention
+    "VLLM_USE_TRITON_FLASH_ATTN":
+    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+             ("true", "1")),
+    # Force vllm to use a specific flash-attention version (2 or 3), only valid
+    # when using the flash-attention backend.
+    "VLLM_FLASH_ATTN_VERSION":
+    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
+    # Internal flag to enable Dynamo fullgraph capture
+    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
+    lambda: bool(
+        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
+    # local rank of the process in the distributed setting, used to determine
+    # the GPU device id
+    "LOCAL_RANK":
+    lambda: int(os.environ.get("LOCAL_RANK", "0")),
+    # used to control the visible devices in the distributed setting
+    "CUDA_VISIBLE_DEVICES":
+    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+    # timeout for each iteration in the engine
+    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
+    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+    # API key for VLLM API server
+    "VLLM_API_KEY":
+    lambda: os.environ.get("VLLM_API_KEY", None),
+    # S3 access information, used for tensorizer to load model from S3
+    "S3_ACCESS_KEY_ID":
+    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
+    "S3_SECRET_ACCESS_KEY":
+    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+    "S3_ENDPOINT_URL":
+    lambda: os.environ.get("S3_ENDPOINT_URL", None),
+    # Usage stats collection
+    "VLLM_USAGE_STATS_SERVER":
+    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
+    "VLLM_NO_USAGE_STATS":
+    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+    "VLLM_DO_NOT_TRACK":
+    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
+        "DO_NOT_TRACK", None) or "0") == "1",
+    "VLLM_USAGE_SOURCE":
+    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
+    # Logging configuration
+    # If set to 0, vllm will not configure logging
+    # If set to 1, vllm will configure logging using the default configuration
+    #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
+    "VLLM_CONFIGURE_LOGGING":
+    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+    "VLLM_LOGGING_CONFIG_PATH":
+    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
+    # this is used for configuring the default logging level
+    "VLLM_LOGGING_LEVEL":
+    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),
+    # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
+    "VLLM_LOGGING_PREFIX":
+    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
+    # if set, vllm will call logits processors in a thread pool with this many
+    # threads. This is useful when using custom logits processors that either
+    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
+    # while not holding the python GIL, or both.
+    "VLLM_LOGITS_PROCESSOR_THREADS":
+    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
+    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
+    # Trace function calls
+    # If set to 1, vllm will trace function calls
+    # Useful for debugging
+    "VLLM_TRACE_FUNCTION":
+    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
+    # Backend for attention computation
+    # Available options:
+    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+    # - "FLASH_ATTN": use FlashAttention
+    # - "XFORMERS": use XFormers
+    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
+    "VLLM_ATTENTION_BACKEND":
+    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+    # If set, vllm will use flashinfer sampler
+    "VLLM_USE_FLASHINFER_SAMPLER":
+    lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
+    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None,
+    # If set, vllm will force flashinfer to use tensor cores;
+    # otherwise will use heuristic based on model architecture.
+    "VLLM_FLASHINFER_FORCE_TENSOR_CORES":
+    lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),
+    # Pipeline stage partition strategy
+    "VLLM_PP_LAYER_PARTITION":
+    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
+    # (CPU backend only) CPU key-value cache space.
+    # default is 4GB
+    "VLLM_CPU_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
+    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
+    "VLLM_CPU_OMP_THREADS_BIND":
+    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
+    # OpenVINO device selection
+    # default is CPU
+    "VLLM_OPENVINO_DEVICE":
+    lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "VLLM_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
+    "VLLM_USE_RAY_SPMD_WORKER":
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),
+    # If the env var is set, it uses the Ray's compiled DAG API
+    # which optimizes the control plane overhead.
+    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+    "VLLM_USE_RAY_COMPILED_DAG":
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),
+    # If the env var is set, it uses NCCL for communication in
+    # Ray's compiled DAG. This flag is ignored if
+    # VLLM_USE_RAY_COMPILED_DAG is not set.
+    "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
+                 ),
+    # If the env var is set, it enables GPU communication overlap
+    # (experimental feature) in Ray's compiled DAG. This flag is ignored if
+    # VLLM_USE_RAY_COMPILED_DAG is not set.
+    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
+                 ),
+    # Use dedicated multiprocess context for workers.
+    # Both spawn and fork work
+    "VLLM_WORKER_MULTIPROC_METHOD":
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
+    # Path to the cache for storing downloaded assets
+    "VLLM_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "vllm", "assets"),
+        )),
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "VLLM_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
+    # Timeout for fetching videos when serving multimodal models
+    # Default is 15 seconds
+    "VLLM_VIDEO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")),
+    # Timeout for fetching audio when serving multimodal models
+    # Default is 10 seconds
+    "VLLM_AUDIO_FETCH_TIMEOUT":
+    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "VLLM_XLA_CACHE_PATH":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
+        )),
+    "VLLM_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
+    # If set, vllm will skip the deprecation warnings.
+    "VLLM_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
+    # the user to specify a max sequence length greater than
+    # the max length derived from the model's config.json.
+    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
+    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
+    lambda:
+    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
+     ("1", "true")),
+    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
+    # of the hardware support for FP8 compute.
+    "VLLM_TEST_FORCE_FP8_MARLIN":
+    lambda:
+    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
+     ("1", "true")),
+    "VLLM_TEST_FORCE_LOAD_FORMAT":
+    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),
+    # Time in ms for the zmq client to wait for a response from the backend
+    # server for simple data operations
+    "VLLM_RPC_TIMEOUT":
+    lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
+    # a list of plugin names to load, separated by commas.
+    # if this is not set, it means all plugins will be loaded
+    # if this is set to an empty string, no plugins will be loaded
+    "VLLM_PLUGINS":
+    lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[
+        "VLLM_PLUGINS"].split(","),
+    # Enables torch profiler if set. Path to the directory where torch profiler
+    # traces are saved. Note that it must be an absolute path.
+    "VLLM_TORCH_PROFILER_DIR":
+    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
+             .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),
+    # If set, vLLM will use Triton implementations of AWQ.
+    "VLLM_USE_TRITON_AWQ":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
+    # If set, allow loading or unloading lora adapters in runtime,
+    "VLLM_ALLOW_RUNTIME_LORA_UPDATING":
+    lambda:
+    (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
+     ("1", "true")),
+    # By default, vLLM will check the peer-to-peer capability itself,
+    # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
+    # If this env var is set to 1, vLLM will skip the peer-to-peer check,
+    # and trust the driver's peer-to-peer capability report.
+    "VLLM_SKIP_P2P_CHECK":
+    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
+    # List of quantization kernels that should be disabled, used for testing
+    # and performance comparisons. Currently only affects MPLinearKernel
+    # selection
+    # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
+    "VLLM_DISABLED_KERNELS":
+    lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
+        "VLLM_DISABLED_KERNELS"].split(","),
+    # If set, use the V1 code path.
+    "VLLM_USE_V1":
+    lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))),
+    # Divisor for dynamic key scale factor calculation for FP8 KV Cache
+    "K_SCALE_CONSTANT":
+    lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
+    # Divisor for dynamic value scale factor calculation for FP8 KV Cache
+    "V_SCALE_CONSTANT":
+    lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
+    # If set, enable multiprocessing in LLM for the V1 code path.
+    "VLLM_ENABLE_V1_MULTIPROCESSING":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
+    "VLLM_LOG_BATCHSIZE_INTERVAL":
+    lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
+    "VLLM_DISABLE_COMPILE_CACHE":
+    lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),
+    # If set, vllm will run in development mode, which will enable
+    # some additional endpoints for developing and debugging,
+    # e.g. `/reset_prefix_cache`
+    "VLLM_SERVER_DEV_MODE":
+    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. It is applicable when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
+    # If set, vLLM will disable the MLA attention optimizations.
+    "VLLM_MLA_DISABLE":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
+    # Flag that can control whether or not we perform matrix-absorption for MLA
+    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
+    # matrices reduces the runtime FLOPs needed to compute MLA but requires
+    # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage,
+    # this is enabled by default
+    "VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
+    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
+    # When running MLA with matrix-absorption enabled and fp8 quantized weights
+    # we perform the matrix-absorption in float32 precision, after the matrices
+    # are absorbed we requantize the weights back to fp8, this flag can be used
+    # to disable the requantization step, and instead convert the absorbed
+    # matrices to match the activation type. This can lead to higher memory and
+    # compute usage but better preserves the accuracy of the original model.
+    "VLLM_MLA_DISABLE_REQUANTIZATION":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
+    # If set, vLLM will use the Triton implementation of moe_align_block_size,
+    # i.e. moe_align_block_size_triton in fused_moe.py.
+    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+                 ),
+    # Number of GPUs per worker in Ray, if it is set to be a fraction,
+    # it allows ray to schedule multiple actors on a single GPU,
+    # so that users can colocate other actors on the same GPUs as vLLM.
+    "VLLM_RAY_PER_WORKER_GPUS":
+    lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
+    # Bundle indices for Ray, if it is set, it can control precisely
+    # which indices are used for the Ray bundle, for every worker.
+    # Format: comma-separated list of integers, e.g. "0,1,2,3"
+    "VLLM_RAY_BUNDLE_INDICES":
+    lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
+    # When on a Nvidia GPU aligns single entries (within a page) so they are 256
+    # byte aligned for better performance, this increases the memory usage of
+    # the cache. Currently this only affects MLA that results in non-256
+    # byte aligned entries. This matches the alignment the CUDA runtime uses
+    # for all allocations. Currently this primarily affects MLA, for most other
+    # models the alignment is already naturally aligned to 256 bytes.
+    "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
+    lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
+}
+# end-env-vars-definition
+def __getattr__(name: str):
+    # lazy evaluation of environment variables
+    if name in environment_variables:
+        return environment_variables[name]()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+def __dir__():
+    return list(environment_variables.keys())

.venv/lib/python3.11/site-packages/vllm/executor/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/vllm/executor/__pycache__/ray_distributed_executor.cpython-311.pyc ADDED Viewed

Binary file (30 kB). View file

.venv/lib/python3.11/site-packages/vllm/executor/executor_base.py ADDED Viewed

	@@ -0,0 +1,376 @@

+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from abc import ABC, abstractmethod
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
+import torch.nn as nn
+from typing_extensions import TypeVar
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sequence import ExecuteModelRequest, PoolerOutput
+from vllm.utils import make_async
+from vllm.worker.worker_base import WorkerBase
+logger = init_logger(__name__)
+_R = TypeVar("_R", default=Any)
+class ExecutorBase(ABC):
+    """Base class for all executors.
+    An executor is responsible for executing the model on one device,
+    or it can be a distributed executor
+    that can execute the model on multiple devices.
+    """
+    uses_ray: bool  # whether the executor uses Ray for orchestration.
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+    ) -> None:
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.load_config = vllm_config.load_config
+        self.parallel_config = vllm_config.parallel_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.device_config = vllm_config.device_config
+        self.speculative_config = vllm_config.speculative_config
+        self.prompt_adapter_config = vllm_config.prompt_adapter_config
+        self.observability_config = vllm_config.observability_config
+        self._init_executor()
+        self.is_sleeping = False
+    @abstractmethod
+    def _init_executor(self) -> None:
+        raise NotImplementedError
+    @abstractmethod
+    def collective_rpc(self,
+                       method: Union[str, Callable[..., _R]],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict[str, Any]] = None) -> List[_R]:
+        """
+        Execute an RPC call on all workers.
+        Args:
+            method: Name of the worker method to execute, or a callable that
+                is serialized and sent to all workers to execute.
+                If the method is a callable, it should accept an additional
+                `self` argument, in addition to the arguments passed in `args`
+                and `kwargs`. The `self` argument will be the worker object.
+            timeout: Maximum time in seconds to wait for execution. Raises a
+                :exc:`TimeoutError` on timeout. `None` means wait indefinitely.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+        Returns:
+            A list containing the results from each worker.
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
+        """
+        raise NotImplementedError
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available blocks for the GPU KV cache and
+        swappable CPU KV cache.
+        Normally, this should simply delegate to the underlying Worker. Some
+        ExecutorBase may require modification of the result, e.g. to ensure the
+        selected cache sizes are compatible with all workers.
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        are blocks that are "active" on the device and can be appended to.
+        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
+        appended to.
+        """
+        results = self.collective_rpc("determine_num_available_blocks")
+        a = min([r[0] for r in results])
+        b = min([r[1] for r in results])
+        return a, b
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
+        """
+        # NOTE: This is logged in the executor because there can be >1 workers.
+        logger.info("# %s blocks: %d, # CPU blocks: %d",
+                    current_platform.dispatch_key, num_gpu_blocks,
+                    num_cpu_blocks)
+        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
+                           self.model_config.max_model_len)
+        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                    self.model_config.max_model_len, max_concurrency)
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+        self.collective_rpc("initialize_cache",
+                            args=(num_gpu_blocks, num_cpu_blocks))
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        """
+        Run a function directly on the model inside each worker,
+        returning the result for each of them.
+        """
+        def rpc_func(worker: WorkerBase) -> _R:
+            return func(worker.get_model())
+        return self.collective_rpc(rpc_func)
+    def execute_model(
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
+        output = self.collective_rpc("execute_model",
+                                     args=(execute_model_req, ))
+        return output[0]
+    def stop_remote_worker_execution_loop(self) -> None:
+        """Releases parallel workers from model loop."""
+        return
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("add_lora", args=(lora_request, )))
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("remove_lora", args=(lora_id, )))
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("pin_lora", args=(lora_id, )))
+    def list_loras(self) -> Set[int]:
+        sets = self.collective_rpc("list_loras")
+        for s in sets:
+            assert s == sets[0], "All workers should have the same LORAs."
+        return sets[0]
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        assert prompt_adapter_request.prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("add_prompt_adapter",
+                                args=(prompt_adapter_request, )))
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("remove_prompt_adapter",
+                                args=(prompt_adapter_id, )))
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("pin_prompt_adapter",
+                                args=(prompt_adapter_id, )))
+    def list_prompt_adapters(self) -> Set[int]:
+        sets = self.collective_rpc("list_prompt_adapters")
+        for s in sets:
+            assert (s == sets[0]
+                    ), "All workers should have the same prompt adapters."
+        return sets[0]
+    def start_profile(self) -> None:
+        self.collective_rpc("start_profile")
+    def stop_profile(self) -> None:
+        self.collective_rpc("stop_profile")
+    def sleep(self, level: int = 1):
+        if self.is_sleeping:
+            logger.warning("Executor is already sleeping.")
+            return
+        self.collective_rpc("sleep", kwargs=dict(level=level))
+        self.is_sleeping = True
+    def wake_up(self):
+        if not self.is_sleeping:
+            logger.warning("Executor is not sleeping.")
+            return
+        self.collective_rpc("wake_up")
+        self.is_sleeping = False
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        self.collective_rpc("save_sharded_state",
+                            kwargs=dict(path=path,
+                                        pattern=pattern,
+                                        max_size=max_size))
+    @abstractmethod
+    def check_health(self) -> None:
+        """Checks if the executor is healthy. If not, it should raise an
+        exception."""
+        raise NotImplementedError
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+        return
+    def __del__(self):
+        self.shutdown()
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        """Executes one model step on the given sequences."""
+        output = await make_async(self.execute_model)(execute_model_req)
+        return output
+    async def stop_remote_worker_execution_loop_async(self) -> None:
+        """Releases parallel workers from model loop."""
+        return
+    async def check_health_async(self) -> None:
+        """Checks if the executor is healthy. If not, it should raise an
+        exception."""
+        self.check_health()
+class DistributedExecutorBase(ExecutorBase):
+    """Abstract superclass of distributed executor implementations."""
+    def __init__(self, *args, **kwargs):
+        # This is non-None when the execute model loop is running
+        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
+        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+        super().__init__(*args, **kwargs)
+    def execute_model(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        # TODO: unify into collective_rpc
+        if self.parallel_worker_tasks is None:
+            self.parallel_worker_tasks = self._run_workers(
+                "start_worker_execution_loop",
+                async_run_tensor_parallel_workers_only=True)
+        # Only the driver worker returns the sampling results.
+        driver_outputs = self._driver_execute_model(execute_model_req)
+        assert driver_outputs is not None
+        return driver_outputs
+    def stop_remote_worker_execution_loop(self) -> None:
+        if self.parallel_worker_tasks is None:
+            return
+        self._driver_execute_model(execute_model_req=None)
+        parallel_worker_tasks = self.parallel_worker_tasks
+        self.parallel_worker_tasks = None
+        # Ensure that workers exit model loop cleanly
+        # (this will raise otherwise)
+        self._wait_for_tasks_completion(parallel_worker_tasks)
+    @abstractmethod
+    def _driver_execute_model(
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
+        """Run execute_model in the driver worker.
+        Passing None will cause the driver to stop the model execution loop
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
+        """
+        raise NotImplementedError
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        return self._run_workers(method, *args, **(kwargs or {}))
+    @abstractmethod
+    def _run_workers(
+        self,
+        method: Union[str, Callable],
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers.
+        Args:
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
+        # TODO: simplify and merge with collective_rpc
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        raise NotImplementedError
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if self.parallel_worker_tasks is None:
+            # Start model execution loop running in the parallel workers
+            self.parallel_worker_tasks = asyncio.create_task(
+                self._start_worker_execution_loop())
+        # Only the driver worker returns the sampling results.
+        return await self._driver_execute_model_async(execute_model_req)
+    async def stop_remote_worker_execution_loop_async(self) -> None:
+        if self.parallel_worker_tasks is None:
+            return
+        await self._driver_execute_model_async()
+        parallel_worker_tasks = self.parallel_worker_tasks
+        self.parallel_worker_tasks = None
+        # Ensure that workers exit model loop cleanly
+        # (this will raise otherwise)
+        await parallel_worker_tasks
+    @abstractmethod
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> List[SamplerOutput]:
+        """Execute the model asynchronously in the driver worker.
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    async def _start_worker_execution_loop(self):
+        """Run execution loop on all workers. It guarantees all workers run
+        the loop or None of them is running the loop. Loop can be stopped by
+        `stop_remote_worker_execution_loop`.
+        The API is idempotent (guarantee only 1 loop run at any moment)."""
+        raise NotImplementedError

.venv/lib/python3.11/site-packages/vllm/executor/mp_distributed_executor.py ADDED Viewed

	@@ -0,0 +1,243 @@

+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import os
+from typing import Any, Callable, List, Optional, Union
+import cloudpickle
+from vllm.executor.executor_base import DistributedExecutorBase
+from vllm.executor.multiproc_worker_utils import (
+    ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
+    set_multiprocessing_worker_envs)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
+                        make_async, run_method, update_environment_variables)
+from vllm.worker.worker_base import WorkerWrapperBase
+logger = init_logger(__name__)
+class MultiprocessingDistributedExecutor(DistributedExecutorBase):
+    """Python multiprocessing-based distributed executor"""
+    uses_ray: bool = False
+    def _check_cuda(self) -> None:
+        """Check that the number of GPUs is sufficient for the parallel
+        configuration. Separate from _init_executor to reduce the number of
+        indented blocks.
+        """
+        parallel_config = self.parallel_config
+        world_size = parallel_config.world_size
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+        cuda_device_count = cuda_device_count_stateless()
+        # Use confusing message for more common TP-only case.
+        if tensor_parallel_size > cuda_device_count:
+            raise RuntimeError(
+                f"please set tensor_parallel_size ({tensor_parallel_size}) "
+                f"to less than max local gpu count ({cuda_device_count})")
+        if world_size > cuda_device_count:
+            raise RuntimeError(
+                f"please ensure that world_size ({world_size}) "
+                f"is less than than max local gpu count ({cuda_device_count})")
+        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+        if "CUDA_VISIBLE_DEVICES" not in os.environ:
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
+    def _init_executor(self) -> None:
+        from vllm.platforms import current_platform
+        if current_platform.is_cuda_alike():
+            self._check_cuda()
+        # Create the parallel GPU workers.
+        world_size = self.parallel_config.world_size
+        tensor_parallel_size = self.parallel_config.tensor_parallel_size
+        # Set multiprocessing envs that are common to V0 and V1
+        set_multiprocessing_worker_envs(self.parallel_config)
+        # Multiprocessing-based executor does not support multi-node setting.
+        # Since it only works for single node, we can use the loopback address
+        # 127.0.0.1 for communication.
+        distributed_init_method = get_distributed_init_method(
+            "127.0.0.1", get_open_port())
+        self.workers: List[ProcessWorkerWrapper] = []
+        # This is the list of workers that are rank 0 of each TP group EXCEPT
+        # global rank 0. These are the workers that will broadcast to the
+        # rest of the workers.
+        self.tp_driver_workers: List[ProcessWorkerWrapper] = []
+        # This is the list of workers that are not drivers and not the first
+        # worker in a TP group. These are the workers that will be
+        # broadcasted to.
+        self.non_driver_workers: List[ProcessWorkerWrapper] = []
+        if world_size == 1:
+            self.worker_monitor = None
+        else:
+            result_handler = ResultHandler()
+            for rank in range(1, world_size):
+                worker = ProcessWorkerWrapper(result_handler,
+                                              WorkerWrapperBase,
+                                              self.vllm_config, rank)
+                self.workers.append(worker)
+                if rank % tensor_parallel_size == 0:
+                    self.tp_driver_workers.append(worker)
+                else:
+                    self.non_driver_workers.append(worker)
+            self.worker_monitor = WorkerMonitor(self.workers, result_handler)
+            result_handler.start()
+            self.worker_monitor.start()
+        # Set up signal handlers to shutdown the executor cleanly
+        # sometimes gc does not work well
+        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
+        all_kwargs = []
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        for i in range(world_size):
+            local_rank = i
+            rank = i
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self._run_workers("init_worker", all_kwargs)
+        self._run_workers("init_device")
+        self._run_workers("load_model",
+                          max_concurrent_workers=self.parallel_config.
+                          max_parallel_loading_workers)
+        self.driver_exec_model = make_async(self.driver_worker.execute_model)
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
+    def shutdown(self):
+        if (worker_monitor := getattr(self, "worker_monitor",
+                                      None)) is not None:
+            worker_monitor.close()
+    def _driver_execute_model(
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
+        """Run execute_model in the driver worker.
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        return self.driver_worker.execute_model(execute_model_req)
+    def _run_workers(
+        self,
+        method: Union[str, Callable],
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> List[Any]:
+        """Runs the given method on all workers.
+        Args:
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
+        """
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = cloudpickle.dumps(method)
+        del method
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+        if async_run_tensor_parallel_workers_only:
+            # Run only non-driver workers and just return futures.
+            return [
+                worker.execute_method(sent_method, *args, **kwargs)
+                for worker in self.non_driver_workers
+            ]
+        # Start all remote workers first.
+        worker_outputs = [
+            worker.execute_method(sent_method, *args, **kwargs)
+            for worker in self.workers
+        ]
+        driver_worker_output = run_method(self.driver_worker, sent_method,
+                                          args, kwargs)
+        # Get the results of the workers.
+        return [driver_worker_output
+                ] + [output.get() for output in worker_outputs]
+    def check_health(self) -> None:
+        """Raises an error if engine is unhealthy."""
+        if self.worker_monitor is not None and not self.worker_monitor.is_alive(
+        ):
+            raise RuntimeError("Worker processes are not running")
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        for result in parallel_worker_tasks:
+            result.get()
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        if not self.tp_driver_workers:
+            return await self.driver_exec_model(execute_model_req)
+        if self.pp_locks is None:
+            # This locks each pipeline parallel stage so multiple virtual
+            # engines can't execute on the same stage at the same time
+            # We create the locks here to avoid creating them in the constructor
+            # which uses a different asyncio loop.
+            self.pp_locks = [
+                asyncio.Lock()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
+        tasks = [
+            asyncio.create_task(
+                _run_task_with_lock(self.driver_exec_model, self.pp_locks[0],
+                                    execute_model_req))
+        ]
+        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
+                                                start=1):
+            tasks.append(
+                asyncio.create_task(
+                    _run_task_with_lock(driver_worker.execute_method_async,
+                                        self.pp_locks[pp_rank],
+                                        "execute_model", execute_model_req)))
+        results = await asyncio.gather(*tasks)
+        # Only the last PP stage has the final results.
+        return results[-1]
+    async def _start_worker_execution_loop(self):
+        coros = [
+            worker.execute_method_async("start_worker_execution_loop")
+            for worker in self.non_driver_workers
+        ]
+        return await asyncio.gather(*coros)

.venv/lib/python3.11/site-packages/vllm/executor/msgspec_utils.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# SPDX-License-Identifier: Apache-2.0
+from array import array
+from typing import Any, Type
+from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
+def encode_hook(obj: Any) -> Any:
+    """Custom msgspec enc hook that supports array types.
+    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
+    """
+    if isinstance(obj, array):
+        assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
+            f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
+            f"Given array has a type code of {obj.typecode}.")
+        return obj.tobytes()
+def decode_hook(type: Type, obj: Any) -> Any:
+    """Custom msgspec dec hook that supports array types.
+    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
+    """
+    if type is array:
+        deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
+        deserialized.frombytes(obj)
+        return deserialized

.venv/lib/python3.11/site-packages/vllm/executor/ray_distributed_executor.py ADDED Viewed

	@@ -0,0 +1,638 @@

+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import os
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+import cloudpickle
+import msgspec
+import vllm.envs as envs
+from vllm.executor.executor_base import (
+    DistributedExecutorBase)  # yapf: disable
+from vllm.executor.msgspec_utils import encode_hook
+from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
+                                     ray)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, make_async)
+if ray is not None:
+    from ray.actor import ActorHandle
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+else:
+    ActorHandle = None
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+logger = init_logger(__name__)
+@dataclass
+class RayWorkerMetaData:
+    """
+    Metadata for a Ray worker.
+    The order of ray worker creation can be random,
+    and we need to reset the rank after creating all workers.
+    """
+    # Handle of the Ray actor this entry describes.
+    worker: ActorHandle
+    # Rank the worker was given when it was created.
+    created_rank: int
+    # Rank assigned after the workers have been re-ordered; -1 until then.
+    adjusted_rank: int = -1
+    # IP address of the node hosting the worker; empty until it is filled in.
+    ip: str = ""
+class RayDistributedExecutor(DistributedExecutorBase):
+    uses_ray: bool = True
+    def _init_executor(self) -> None:
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
+        if envs.VLLM_USE_V1:
+            # v1 always uses the compiled DAG and SPMD worker.
+            os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
+            os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"
+        # If the env var is set, it uses the Ray's compiled DAG API
+        # which optimizes the control plane overhead.
+        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+        # Currently, this requires USE_RAY_SPMD_WORKER=True.
+        self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
+        # If the env var is set, then we do not distinguish between the
+        # "driver worker" vs other workers. Also, the rank 0 worker will
+        # be executed in a remote Ray worker. Currently this requires
+        # USE_RAY_COMPILED_DAG=True.
+        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+        if self.use_ray_compiled_dag:
+            assert self.use_ray_spmd_worker, (
+                "VLLM_USE_RAY_COMPILED_DAG=1 requires "
+                "VLLM_USE_RAY_SPMD_WORKER=1")
+        if self.use_ray_spmd_worker:
+            # TODO: Support SPMD worker for non-DAG Ray executor.
+            assert self.use_ray_compiled_dag, (
+                "VLLM_USE_RAY_SPMD_WORKER=1 requires "
+                "VLLM_USE_RAY_COMPILED_DAG=1")
+        assert self.uses_ray
+        initialize_ray_cluster(self.parallel_config)
+        placement_group = self.parallel_config.placement_group
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+        self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+        self.output_decoder = msgspec.msgpack.Decoder(
+            Optional[List[SamplerOutput]])
+        self.use_v1 = envs.VLLM_USE_V1
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
+        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+        if not self.use_ray_compiled_dag:
+            self.driver_exec_method = make_async(
+                self.driver_worker.execute_method)
+    def shutdown(self) -> None:
+        if hasattr(self, "forward_dag") and self.forward_dag is not None:
+            self.forward_dag.teardown()
+            import ray
+            for worker in self.workers:
+                ray.kill(worker)
+            self.forward_dag = None
+    def _configure_ray_workers_use_nsight(self,
+                                          ray_remote_kwargs) -> Dict[str, Any]:
+        # If nsight profiling is enabled, we need to set the profiling
+        # configuration for the ray workers as runtime env.
+        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+        runtime_env.update({
+            "nsight": {
+                "t": "cuda,cudnn,cublas",
+                "o": "'worker_process_%p'",
+                "cuda-graph-trace": "node",
+            }
+        })
+        return ray_remote_kwargs
+    # child class could overwrite this to return actual env vars.
+    def _get_env_vars_to_be_updated(self):
+        return self._env_vars_for_all_workers
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        """Create the Ray worker actors, assign their ranks and initialize
+        them.
+        Args:
+            placement_group: Ray placement group whose bundles host the
+                workers.
+            **ray_remote_kwargs: Extra keyword arguments forwarded to
+                ``ray.remote`` when each actor is created.
+        Raises:
+            ValueError: If not in SPMD mode and no worker reports the same
+                IP as the driver.
+            RuntimeError: If the number of distinct node ids differs from
+                the number of distinct IP addresses.
+        """
+        num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
+        # The remaining workers are the actual ray actors.
+        self.workers: List[RayWorkerWrapper] = []
+        # Used in ray compiled DAG: indexed first by PP rank,
+        # and then TP rank. In other words, the inner list is
+        # the TP group of workers for a PP rank.
+        self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
+        if self.parallel_config.ray_workers_use_nsight:
+            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
+                ray_remote_kwargs)
+        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
+        # Create the workers.
+        bundle_indices: List[int]
+        if envs.VLLM_RAY_BUNDLE_INDICES:
+            # Use the bundle indices specified by the user.
+            bundle_indices = list(
+                map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(",")))
+            assert len(bundle_indices) == self.parallel_config.world_size, \
+            ("VLLM_RAY_BUNDLE_INDICES must have the same size"
+            f" as the world size, but got {bundle_indices=} "
+            f"and {self.parallel_config.world_size=}")
+            assert len(set(bundle_indices)) == len(bundle_indices), \
+            ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
+            f" but got {bundle_indices=}")
+        else:
+            # use the first N bundles that have GPU resources.
+            bundle_indices = []
+            for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+                if bundle.get(current_platform.ray_device_key, 0):
+                    bundle_indices.append(bundle_id)
+            bundle_indices = bundle_indices[:self.parallel_config.world_size]
+        worker_metadata: List[RayWorkerMetaData] = []
+        driver_ip = get_ip()
+        # One actor per selected bundle; the creation index is used as the
+        # provisional rank.
+        for rank, bundle_id in enumerate(bundle_indices):
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+            if current_platform.ray_device_key == "GPU":
+                # NV+AMD GPUs, and Intel XPUs
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=num_gpus,
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+                                           rpc_rank=rank)
+            else:
+                # Other devices are requested through a custom Ray resource
+                # named after the platform's device key.
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=0,
+                    resources={current_platform.ray_device_key: num_gpus},
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+                                           rpc_rank=rank)
+            worker_metadata.append(
+                RayWorkerMetaData(worker=worker, created_rank=rank))
+        # Ask every actor for the IP of the node it runs on.
+        worker_ips = ray.get([
+            each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
+            for each in worker_metadata
+        ])
+        for each, ip in zip(worker_metadata, worker_ips):
+            each.ip = ip
+        if not self.use_ray_spmd_worker:
+            for i, each in enumerate(worker_metadata):
+                # find and remove the dummy worker from the list
+                worker = each.worker
+                worker_ip = each.ip
+                if self.driver_dummy_worker is None and worker_ip == driver_ip:
+                    # If the worker is on the same node as the driver, we use it
+                    # as the resource holder for the driver process.
+                    self.driver_dummy_worker = worker
+                    self.driver_worker = RayWorkerWrapper(
+                        vllm_config=self.vllm_config, rpc_rank=0)
+                    # Popping during iteration is safe here because the loop
+                    # stops right after.
+                    worker_metadata.pop(i)
+                    break
+        logger.debug("workers: %s", worker_metadata)
+        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
+        if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
+            raise ValueError(
+                "Ray does not allocate any GPUs on the driver node. Consider "
+                "adjusting the Ray placement group or running the driver on a "
+                "GPU node.")
+        # Number of created workers per node IP, used as a sort key below.
+        ip_counts: Dict[str, int] = {}
+        for ip in worker_ips:
+            ip_counts[ip] = ip_counts.get(ip, 0) + 1
+        def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
+            """
+            Sort the workers based on 3 properties:
+            1. If the worker is on the same node as the driver (vllm engine),
+                it should be placed first.
+            2. Then, if the worker is on a node with fewer workers, it should
+                be placed first.
+            3. Finally, if the work is on a node with smaller IP address, it
+                should be placed first.
+            """
+            ip = item.ip
+            return (0 if ip == driver_ip else 1, ip_counts[ip], ip)
+        # After sorting, the workers on the same node will be
+        # close to each other, and the workers on the driver
+        # node will be placed first.
+        sorted_worker_metadata = sorted(worker_metadata,
+                                        key=sort_by_driver_then_worker_ip)
+        # In non-SPMD mode rank 0 belongs to the local driver worker, so the
+        # remote workers are numbered from 1.
+        start_rank = 0 if self.use_ray_spmd_worker else 1
+        for i, item in enumerate(sorted_worker_metadata):
+            item.adjusted_rank = i + start_rank
+        self.workers = [item.worker for item in sorted_worker_metadata]
+        rerank_mapping = {
+            item.created_rank: item.adjusted_rank
+            for item in sorted_worker_metadata
+        }
+        self._run_workers("adjust_rank", rerank_mapping)
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = []
+        for worker in [self.driver_dummy_worker] + self.workers:
+            if worker is None:
+                # driver_dummy_worker can be None when using ray spmd worker.
+                continue
+            worker_node_and_gpu_ids.append(
+                ray.get(worker.get_node_and_gpu_ids.remote()) \
+            ) # type: ignore
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+        all_ips = set(worker_ips + [driver_ip])
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set `VLLM_HOST_IP`"
+                " environment variable, make sure it is unique for"
+                " each node.")
+        # Set environment variables for the driver and workers.
+        # Each worker gets the sorted GPU ids of all workers on its node.
+        all_args_to_update_environment_variables = [{
+            current_platform.device_control_env_var:
+            ",".join(map(str, node_gpus[node_id])),
+        } for (node_id, _) in worker_node_and_gpu_ids]
+        for args in all_args_to_update_environment_variables:
+            # some carry-over env vars from the driver
+            # TODO: refactor platform-specific env vars
+            for name in [
+                    "VLLM_ATTENTION_BACKEND",
+                    "TPU_CHIPS_PER_HOST_BOUNDS",
+                    "TPU_HOST_BOUNDS",
+                    "VLLM_USE_V1",
+                    "VLLM_TRACE_FUNCTION",
+            ]:
+                if name in os.environ:
+                    args[name] = os.environ[name]
+        self._env_vars_for_all_workers = (
+            all_args_to_update_environment_variables)
+        self._run_workers("update_environment_variables",
+                          self._get_env_vars_to_be_updated())
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+        # Initialize the actual workers inside worker wrapper.
+        all_kwargs = []
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+            # local_rank is the worker's position among the workers that
+            # share its node.
+            local_rank = node_workers[node_id].index(rank)
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self._run_workers("init_worker", all_kwargs)
+        self._run_workers("init_device")
+        self._run_workers("load_model",
+                          max_concurrent_workers=self.parallel_config.
+                          max_parallel_loading_workers)
+        if self.use_ray_spmd_worker:
+            for pp_rank in range(self.parallel_config.pipeline_parallel_size):
+                self.pp_tp_workers.append([])
+                for tp_rank in range(
+                        self.parallel_config.tensor_parallel_size):
+                    # PP=2, TP=4
+                    # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
+                    rank = (pp_rank * self.parallel_config.tensor_parallel_size
+                            ) + tp_rank
+                    assert len(self.pp_tp_workers[pp_rank]) == tp_rank
+                    assert pp_rank < len(self.pp_tp_workers)
+                    self.pp_tp_workers[pp_rank].append(self.workers[rank])
+        # This is the list of workers that are rank 0 of each TP group EXCEPT
+        # global rank 0. These are the workers that will broadcast to the
+        # rest of the workers.
+        self.tp_driver_workers: List[RayWorkerWrapper] = []
+        # This is the list of workers that are not drivers and not the first
+        # worker in a TP group. These are the workers that will be
+        # broadcasted to.
+        self.non_driver_workers: List[RayWorkerWrapper] = []
+        # Enforce rank order for correct rank to return final output.
+        # NOTE(review): `rank = index + 1` presumes rank 0 is a driver held
+        # outside self.workers; in SPMD mode self.workers starts at adjusted
+        # rank 0 - confirm these two lists are not used in that mode.
+        for index, worker in enumerate(self.workers):
+            # The driver worker is rank 0 and not in self.workers.
+            rank = index + 1
+            if rank % self.parallel_config.tensor_parallel_size == 0:
+                self.tp_driver_workers.append(worker)
+            else:
+                self.non_driver_workers.append(worker)
+    def _driver_execute_model(
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
+        """Run execute_model in the driver worker.
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        assert not self.use_ray_spmd_worker, (
+            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
+        return self.driver_worker.execute_method("execute_model",
+                                                 execute_model_req)
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if not self.use_ray_spmd_worker:
+            return super().execute_model(execute_model_req)
+        if self.forward_dag is None:
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
+        if self.use_v1:
+            serialized_data = execute_model_req
+        else:
+            serialized_data = self.input_encoder.encode(execute_model_req)
+        outputs = ray.get(self.forward_dag.execute(serialized_data))
+        if self.use_v1:
+            output = outputs[0]
+        else:
+            output = self.output_decoder.decode(outputs[0])
+        return output
+    def _run_workers(
+        self,
+        method: Union[str, Callable],
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers. Can be used in the following
+        ways:
+        Args:
+        - async_run_tensor_parallel_workers_only: If True the method will be
+          run only in the remote TP workers, not the driver worker.
+          It will also be run asynchronously and return a list of futures
+          rather than blocking on the results.
+        - args/kwargs: All workers share the same args/kwargs
+        """
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = cloudpickle.dumps(method)
+        del method
+        if self.use_ray_spmd_worker:
+            assert not async_run_tensor_parallel_workers_only, (
+                "async_run_tensor_parallel_workers_only is not supported for "
+                "spmd mode.")
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+        # Start the ray workers first.
+        ray_workers = self.workers
+        if async_run_tensor_parallel_workers_only:
+            ray_workers = self.non_driver_workers
+        ray_worker_outputs = [
+            worker.execute_method.remote(sent_method, *args, **kwargs)
+            for worker in ray_workers
+        ]
+        if async_run_tensor_parallel_workers_only:
+            # Just return futures
+            return ray_worker_outputs
+        driver_worker_output = []
+        # In SPMD mode, the driver worker is the same as any other worker,
+        # so we only explicitly execute on the driver worker if using a
+        # non-SPMD worker class.
+        if not self.use_ray_spmd_worker:
+            # Start the driver worker after all the ray workers.
+            driver_worker_output = [
+                self.driver_worker.execute_method(sent_method, *args, **kwargs)
+            ]
+        # Get the results of the ray workers.
+        if self.workers:
+            ray_worker_outputs = ray.get(ray_worker_outputs)
+        return driver_worker_output + ray_worker_outputs
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        ray.get(parallel_worker_tasks)
+    def _check_ray_adag_installation(self):
+        import pkg_resources
+        from packaging import version
+        required_version = version.parse("2.40")
+        current_version = version.parse(
+            pkg_resources.get_distribution("ray").version)
+        if current_version < required_version:
+            raise ValueError(f"Ray version {required_version} is "
+                             f"required, but found {current_version}")
+        import importlib.util
+        adag_spec = importlib.util.find_spec(
+            "ray.experimental.compiled_dag_ref")
+        if adag_spec is None:
+            raise ValueError("Ray accelerated DAG is not installed. "
+                             "Run `pip install ray[adag]` to install it.")
+        cupy_spec = importlib.util.find_spec("cupy")
+        if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
+            raise ValueError(
+                "cupy is not installed but required since "
+                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
+                "Run `pip install ray[adag]` and check cupy installation.")
+    def _compiled_ray_dag(self, enable_asyncio: bool):
+        assert self.parallel_config.use_ray
+        self._check_ray_adag_installation()
+        from ray.dag import InputNode, MultiOutputNode
+        from ray.experimental.channel.torch_tensor_type import TorchTensorType
+        logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s",
+                    envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
+        logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
+                    envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
+        with InputNode() as input_data:
+            # Example DAG: PP=2, TP=4
+            # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput   # noqa: E501
+            #                         -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput   # noqa: E501
+            #                         -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput   # noqa: E501
+            #                         -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput   # noqa: E501
+            # All workers in the first TP group will take in the
+            # ExecuteModelRequest as input.
+            outputs = [input_data for _ in self.pp_tp_workers[0]]
+            for pp_rank, tp_group in enumerate(self.pp_tp_workers):
+                # Each PP worker takes in the output of the previous PP worker,
+                # and the TP group executes in SPMD fashion.
+                if self.use_v1:
+                    outputs = [
+                        worker.execute_model.
+                        bind(  # type: ignore[attr-defined]
+                            outputs[i]) for i, worker in enumerate(tp_group)
+                    ]
+                else:
+                    outputs = [
+                        worker.execute_model_spmd.
+                        bind(  # type: ignore[attr-defined]
+                            outputs[i]) for i, worker in enumerate(tp_group)
+                    ]
+                last_pp_rank = len(self.pp_tp_workers) - 1
+                if pp_rank < last_pp_rank:
+                    # Specify how intermediate tensors should be passed
+                    # between pp stages, no need to specify for the last
+                    # pp stage.
+                    transport = "nccl" \
+                        if envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL \
+                        else "auto"
+                    outputs = [
+                        output.with_type_hint(
+                            TorchTensorType(transport=transport))
+                        for output in outputs
+                    ]
+            forward_dag = MultiOutputNode(outputs)
+        return forward_dag.experimental_compile(
+            enable_asyncio=enable_asyncio,
+            _overlap_gpu_communication=envs.
+            VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
+    def __del__(self):
+        # Delegate to shutdown() when the executor object is finalized.
+        self.shutdown()
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if not self.use_ray_spmd_worker:
+            return await super().execute_model_async(execute_model_req)
+        if self.forward_dag is None:
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
+        serialized_data = self.input_encoder.encode(execute_model_req)
+        dag_future = await self.forward_dag.execute_async(serialized_data)
+        output = await dag_future[0]
+        return self.output_decoder.decode(output)
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        assert not self.use_ray_spmd_worker, (
+            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
+        if not self.tp_driver_workers:
+            return await self.driver_exec_method("execute_model",
+                                                 execute_model_req)
+        if self.pp_locks is None:
+            # This locks each pipeline parallel stage so multiple virtual
+            # engines can't execute on the same stage at the same time
+            # We create the locks here to avoid creating them in the constructor
+            # which uses a different asyncio loop.
+            self.pp_locks = [
+                asyncio.Lock()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
+        tasks = [
+            asyncio.create_task(
+                _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
+                                    "execute_model", execute_model_req))
+        ]
+        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
+                                                start=1):
+            tasks.append(
+                asyncio.create_task(
+                    _run_task_with_lock(driver_worker.execute_method.remote,
+                                        self.pp_locks[pp_rank],
+                                        "execute_model", execute_model_req)))
+        results = await asyncio.gather(*tasks)
+        # Only the last PP stage has the final results.
+        return results[-1]
+    async def _start_worker_execution_loop(self):
+        assert not self.use_ray_spmd_worker, (
+            "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
+        coros = [
+            worker.execute_method.remote("start_worker_execution_loop")
+            for worker in self.non_driver_workers
+        ]
+        return await asyncio.gather(*coros)
+    def check_health(self) -> None:
+        """Health check hook; currently a no-op that never raises."""
+        # Assume that the Ray workers are healthy.
+        # TODO: check the health of the Ray workers
+        return

.venv/lib/python3.11/site-packages/vllm/executor/ray_utils.py ADDED Viewed

	@@ -0,0 +1,378 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+import msgspec
+from vllm.config import ParallelConfig
+from vllm.executor.msgspec_utils import decode_hook, encode_hook
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.sequence import ExecuteModelRequest, IntermediateTensors
+from vllm.utils import get_ip
+from vllm.worker.worker_base import WorkerWrapperBase
+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.outputs import ModelRunnerOutput
+logger = init_logger(__name__)
+# Upper bound, in seconds, on how long the placement-group wait loops in this
+# module keep polling.
+PG_WAIT_TIMEOUT = 1800
+try:
+    import ray
+    from ray.util import placement_group_table
+    from ray.util.placement_group import PlacementGroup
+    try:
+        from ray._private.state import available_resources_per_node
+    except ImportError:
+        # Ray 2.9.x doesn't expose `available_resources_per_node`
+        from ray._private.state import state as _state
+        available_resources_per_node = _state._available_resources_per_node
+    class RayWorkerWrapper(WorkerWrapperBase):
+        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
+        lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
+        def __init__(self, *args, **kwargs) -> None:
+            super().__init__(*args, **kwargs)
+            # Since the compiled DAG runs a main execution
+            # in a different thread that calls cuda.set_device.
+            # The flag indicates is set_device is called on
+            # that thread.
+            self.compiled_dag_cuda_device_set = False
+            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
+                                                         dec_hook=decode_hook)
+            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+        def get_node_ip(self) -> str:
+            return get_ip()
+        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
+            node_id = ray.get_runtime_context().get_node_id()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support ray.",
+                                   current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
+            return node_id, gpu_ids
+        def execute_model_spmd(
+            self, req_or_tuple: Union[bytes,
+                                      Tuple[bytes,
+                                            Optional[IntermediateTensors]]]
+        ) -> bytes:
+            """Execute model in SPMD fashion: used only when SPMD worker and
+            compiled DAG are both enabled.
+            Args:
+                req_or_tuple: A request or a tuple containing the
+                    request and intermediate tensors. Intermediate tensors are
+                    None unless if it is provided because it is > 0 pipeline
+                    stage. The request is serialized by msgspec.
+            """
+            if isinstance(req_or_tuple, bytes):
+                serialized_req, intermediate_tensors = req_or_tuple, None
+            else:
+                serialized_req, intermediate_tensors = req_or_tuple
+            execute_model_req = self.input_decoder.decode(serialized_req)
+            # TODO(swang): This is needed right now because Ray aDAG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            import torch
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+            output = self.worker._execute_model_spmd(execute_model_req,
+                                                     intermediate_tensors)
+            # Pipeline model request and output to the next pipeline stage.
+            if isinstance(output, IntermediateTensors):
+                output = serialized_req, output
+            else:
+                output = self.output_encoder.encode(output)
+            return output
+        def setup_device_if_necessary(self):
+            # TODO(swang): This is needed right now because Ray CG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            # We can remove this API after it is fixed in compiled graph.
+            import torch
+            assert self.worker is not None, "Worker is not initialized"
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+        def execute_model(
+            self,
+            scheduler_output: "SchedulerOutput",
+        ) -> "ModelRunnerOutput":
+            self.setup_device_if_necessary()
+            assert self.worker is not None, "Worker is not initialized"
+            output = self.worker.model_runner.execute_model(scheduler_output)
+            return output
+        def override_env_vars(self, vars: Dict[str, str]):
+            os.environ.update(vars)
+    ray_import_err = None
+except ImportError as e:
+    ray = None  # type: ignore
+    ray_import_err = e
+    RayWorkerWrapper = None  # type: ignore
+def ray_is_available() -> bool:
+    """Returns True if Ray is available, i.e. the module-level import of
+    Ray succeeded and ``ray`` was not reset to None."""
+    return ray is not None
+def assert_ray_available():
+    """Raise an exception if Ray is not available.
+    Raises:
+        ValueError: If Ray could not be imported; the original import error
+            is attached as the cause.
+    """
+    if ray is None:
+        raise ValueError("Failed to import Ray, please install Ray with "
+                         "`pip install ray`.") from ray_import_err
+def _verify_bundles(placement_group: "PlacementGroup",
+                    parallel_config: ParallelConfig, device_str: str):
+    """Verify a given placement group has bundles located in the right place.
+    There are 2 rules.
+    - Warn if all tensor parallel workers cannot fit in a single node.
+    - Fail if driver node is not included in a placement group.
+    """
+    assert ray.is_initialized(), (
+        "Ray is not initialized although distributed-executor-backend is ray.")
+    pg_data = placement_group_table(placement_group)
+    # bundle_idx -> node_id
+    bundle_to_node_ids = pg_data["bundles_to_node_id"]
+    # bundle_idx -> bundle (e.g., {"GPU": 1})
+    bundles = pg_data["bundles"]
+    # node_id -> List of bundle (e.g., {"GPU": 1})
+    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)
+    for bundle_idx, node_id in bundle_to_node_ids.items():
+        node_id_to_bundle[node_id].append(bundles[bundle_idx])
+    driver_node_id = ray.get_runtime_context().get_node_id()
+    if driver_node_id not in node_id_to_bundle:
+        raise RuntimeError(
+            f"driver node id {driver_node_id} is not included in a placement "
+            f"group {placement_group.id}. Node id -> bundles "
+            f"{node_id_to_bundle}. "
+            "You don't have enough GPUs available in a current node. Check "
+            "`ray status` to see if you have available GPUs in a node "
+            f"{driver_node_id} before starting an vLLM engine.")
+    for node_id, bundles in node_id_to_bundle.items():
+        if len(bundles) < parallel_config.tensor_parallel_size:
+            logger.warning(
+                "tensor_parallel_size=%d "
+                "is bigger than a reserved number of %ss (%d "
+                "%ss) in a node %s. Tensor parallel workers can be "
+                "spread out to 2+ nodes which can degrade the performance "
+                "unless you have fast interconnect across nodes, like "
+                "Infiniband. To resolve this issue, make sure you have more "
+                "than %d GPUs available at each node.",
+                parallel_config.tensor_parallel_size, device_str, len(bundles),
+                device_str, node_id, parallel_config.tensor_parallel_size)
+def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
+    """Wait until a placement group is ready.
+    It prints the informative log messages if the placement group is
+    not created within time.
+    """
+    # Wait until PG is ready - this will block until all
+    # requested resources are available, and will timeout
+    # if they cannot be provisioned.
+    placement_group_specs = current_placement_group.bundle_specs
+    s = time.time()
+    pg_ready_ref = current_placement_group.ready()
+    wait_interval = 10
+    while time.time() - s < PG_WAIT_TIMEOUT:
+        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
+        if len(ready) > 0:
+            break
+        # Exponential backoff for warning print.
+        wait_interval *= 2
+        logger.info(
+            "Waiting for creating a placement group of specs for "
+            "%d seconds. specs=%s. Check "
+            "`ray status` to see if you have enough resources,"
+            " and make sure the IP addresses used by ray cluster"
+            " are the same as VLLM_HOST_IP environment variable"
+            " specified in each node if you are running on a multi-node.",
+            int(time.time() - s), placement_group_specs)
+    try:
+        ray.get(pg_ready_ref, timeout=0)
+    except ray.exceptions.GetTimeoutError:
+        raise ValueError(
+            "Cannot provide a placement group of "
+            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
+            "`ray status` to make sure the cluster has enough resources."
+        ) from None
+def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
+    ray.util.remove_placement_group(current_placement_group)
+    s = time.time()
+    wait_interval = 10
+    while time.time() - s < PG_WAIT_TIMEOUT:
+        pg = ray.util.get_current_placement_group()
+        if pg is None:
+            break
+        # Exponential backoff for warning print.
+        wait_interval *= 2
+        logger.info(
+            "Waiting for removing a placement group of specs for "
+            "%d seconds.", int(time.time() - s))
+        time.sleep(wait_interval)
+def initialize_ray_cluster(
+    parallel_config: ParallelConfig,
+    ray_address: Optional[str] = None,
+):
+    """Initialize the distributed cluster with Ray.
+    it will connect to the Ray cluster and create a placement group
+    for the workers, which includes the specification of the resources
+    for each distributed worker.
+    Args:
+        parallel_config: The configurations for parallel execution.
+        ray_address: The address of the Ray cluster. If None, uses
+            the default Ray cluster address.
+    """
+    assert_ray_available()
+    from vllm.platforms import current_platform
+    # Connect to a ray cluster.
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        # Try to connect existing ray instance and create a new one if not found
+        try:
+            ray.init("auto", ignore_reinit_error=True)
+        except ConnectionError:
+            logger.warning(
+                "No existing RAY instance detected. "
+                "A new instance will be launched with current node resources.")
+            ray.init(address=ray_address,
+                     ignore_reinit_error=True,
+                     num_gpus=parallel_config.world_size)
+    else:
+        ray.init(address=ray_address, ignore_reinit_error=True)
+    if parallel_config.placement_group:
+        # Placement group is already set.
+        return
+    device_str = current_platform.ray_device_key
+    if not device_str:
+        raise ValueError(
+            f"current platform {current_platform.device_name} does not "
+            "support ray.")
+    # Create placement group for worker processes
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify that we can use the placement group.
+        device_bundles = 0
+        for bundle in bundles:
+            bundle_devices = bundle.get(device_str, 0)
+            if bundle_devices > 1:
+                raise ValueError(
+                    "Placement group bundle cannot have more than 1 "
+                    f"{device_str}.")
+            if bundle_devices:
+                device_bundles += 1
+        if parallel_config.world_size > device_bundles:
+            raise ValueError(
+                f"The number of required {device_str}s exceeds the total "
+                f"number of available {device_str}s in the placement group."
+                f"Required number of devices: {parallel_config.world_size}. "
+                f"Total number of devices: {device_bundles}.")
+    else:
+        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
+        # Log a warning message and delay resource allocation failure response.
+        # Avoid immediate rejection to allow user-initiated placement group
+        # created and wait cluster to be ready
+        if parallel_config.world_size > num_devices_in_cluster:
+            logger.warning(
+                "The number of required %ss exceeds the total "
+                "number of available %ss in the placement group.", device_str,
+                device_str)
+        # Create a new placement group
+        placement_group_specs: List[Dict[str, float]] = ([{
+            device_str: 1.0
+        } for _ in range(parallel_config.world_size)])
+        # vLLM engine is also a worker to execute model with an accelerator,
+        # so it requires to have the device in a current node. Check if
+        # the current node has at least one device.
+        current_ip = get_ip()
+        current_node_id = ray.get_runtime_context().get_node_id()
+        current_node_resource = available_resources_per_node()[current_node_id]
+        if current_node_resource.get(device_str, 0) < 1:
+            raise ValueError(
+                f"Current node has no {device_str} available. "
+                f"{current_node_resource=}. vLLM engine cannot start without "
+                f"{device_str}. Make sure you have at least 1 {device_str} "
+                f"available in a node {current_node_id=} {current_ip=}.")
+        # This way, at least bundle is required to be created in a current
+        # node.
+        placement_group_specs[0][f"node:{current_ip}"] = 0.001
+        # By default, Ray packs resources as much as possible.
+        current_placement_group = ray.util.placement_group(
+            placement_group_specs, strategy="PACK")
+        _wait_until_pg_ready(current_placement_group)
+    assert current_placement_group is not None
+    _verify_bundles(current_placement_group, parallel_config, device_str)
+    # Set the placement group in the parallel config
+    parallel_config.placement_group = current_placement_group
+def get_num_tpu_nodes() -> int:
+    from ray._private.accelerators import TPUAcceleratorManager
+    cluster_resources = ray.cluster_resources()
+    total_tpus = int(cluster_resources["TPU"])
+    tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators()
+    assert total_tpus % tpus_per_node == 0
+    return total_tpus // tpus_per_node
+def get_num_nodes_in_placement_group() -> int:
+    pg_table = ray.util.placement_group_table()
+    current_pg = ray.util.get_current_placement_group()
+    num_nodes = 0
+    if current_pg:
+        nodes_in_pg = set()
+        for pg_key, pg in pg_table.items():
+            if pg_key == current_pg.id.hex():
+                for _, node in pg["bundles_to_node_id"].items():
+                    nodes_in_pg.add(node)
+        num_nodes = len(nodes_in_pg)
+    return num_nodes

.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+import torch.distributed as dist
+import vllm.envs as envs
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        run_method)
+from vllm.worker.worker_base import WorkerWrapperBase
+logger = init_logger(__name__)
+class UniProcExecutor(ExecutorBase):
+    uses_ray: bool = False
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rpc_rank=0)
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        local_rank = 0
+        rank = 0
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=(not self.parallel_config)
+            or (rank % self.parallel_config.tensor_parallel_size == 0),
+        )
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        if kwargs is None:
+            kwargs = {}
+        answer = run_method(self.driver_worker, method, args, kwargs)
+        return [answer]
+    def check_health(self) -> None:
+        # UniProcExecutor will always be healthy as long as
+        # it's running.
+        return
+UniProcExecutorAsync = UniProcExecutor
+class ExecutorWithExternalLauncher(UniProcExecutor):
+    """An executor that uses external launchers to launch engines,
+    specially designed for torchrun-compatible launchers, for
+    offline inference with tensor parallelism.
+    see https://github.com/vllm-project/vllm/issues/11400 for
+    the motivation, and examples/offline_inference/torchrun_example.py
+    for the usage example.
+    The key idea: although it is tensor-parallel inference, we only
+    create one worker per executor, users will launch multiple
+    engines with torchrun-compatible launchers, and all these engines
+    work together to process the same prompts. When scheduling is
+    deterministic, all the engines will generate the same outputs,
+    and they don't need to synchronize the states with each other.
+    """
+    uses_ray: bool = False
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \
+            ("ExecutorWithExternalLauncher does not "
+            "support pipeline parallelism.")
+        assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
+            ("ExecutorWithExternalLauncher needs deterministic "
+            "execution, so it"
+            "does not support delay_factor in scheduling")
+        assert not envs.VLLM_USE_V1, \
+            ("V1 architecture cannot guarantee deterministic execution, "
+            "so it is not supported in ExecutorWithExternalLauncher.")
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rpc_rank=0)
+        # engines are launched in torchrun-compatible launchers
+        # so we can use the env:// method.
+        # required env vars:
+        # - RANK
+        # - MASTER_ADDR
+        # - MASTER_PORT
+        distributed_init_method = "env://"
+        rank = int(os.environ["RANK"])
+        local_rank = rank
+        is_driver_worker = True
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """
+        Determine the number of available KV blocks.
+        Add an additional all_reduce to get the min across all ranks.
+        Note that even if we have the same `gpu_memory_utilization` and
+        `swap_space`, the available memory in every rank might still
+        differ because NCCL can take different amounts of memory in
+        different ranks. Therefore, it is necessary to test if all ranks
+        agree on the same KV cache configuration.
+        """
+        a, b = super().determine_num_available_blocks()
+        from vllm.distributed.parallel_state import get_world_group
+        cpu_group = get_world_group().cpu_group
+        a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64)
+        b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64)
+        dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+        dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+        return a_tensor.item(), b_tensor.item()

.venv/lib/python3.11/site-packages/vllm/forward_context.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# SPDX-License-Identifier: Apache-2.0
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, Optional
+import torch
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+logger = init_logger(__name__)
+# Batch-size timing is enabled when the configured interval is non-negative.
+track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0
+# time.perf_counter() reading taken when the stats were last logged.
+last_logging_time: float = 0
+# time.perf_counter() reading taken when the current forward pass started.
+forward_start_time: float = 0
+# Minimum gap between two stats log lines, compared against
+# time.perf_counter() differences.
+batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL
+# batch size -> list of measured forward times in milliseconds.
+batchsize_forward_time: defaultdict = defaultdict(list)
+@dataclass
+class ForwardContext:
+    """State of the forward pass in progress; an instance is installed by
+    set_forward_context() and read back through get_forward_context()."""
+    # copy from vllm_config.compilation_config.static_forward_context
+    attn_layers: Dict[str, Any]
+    # TODO: extend to support per-layer dynamic forward context
+    attn_metadata: "AttentionMetadata"  # set dynamically for each forward pass
+    # TODO: remove after making all virtual_engines share the same kv cache
+    virtual_engine: int  # set dynamically for each forward pass
+# The context of the forward pass in progress; None outside of
+# set_forward_context() unless an enclosing context is active.
+_forward_context: Optional[ForwardContext] = None
+def get_forward_context() -> ForwardContext:
+    """Get the current forward context.
+    Raises:
+        AssertionError: If no forward context has been set.
+    """
+    assert _forward_context is not None, (
+        "Forward context is not set. "
+        "Please use `set_forward_context` to set the forward context.")
+    return _forward_context
+@contextmanager
+def set_forward_context(attn_metadata: Any,
+                        vllm_config: VllmConfig,
+                        virtual_engine: int = 0):
+    """A context manager that stores the current forward context,
+    can be attention metadata, etc.
+    Here we can inject common logic for every model forward pass.
+    """
+    global forward_start_time
+    need_to_track_batchsize = track_batchsize and attn_metadata is not None
+    if need_to_track_batchsize:
+        forward_start_time = time.perf_counter()
+    global _forward_context
+    prev_context = _forward_context
+    _forward_context = ForwardContext(
+        attn_layers=vllm_config.compilation_config.static_forward_context,
+        virtual_engine=virtual_engine,
+        attn_metadata=attn_metadata)
+    try:
+        yield
+    finally:
+        global last_logging_time, batchsize_logging_interval
+        if need_to_track_batchsize:
+            if hasattr(attn_metadata, "num_prefill_tokens"):
+                # for v0 attention backends
+                batchsize = attn_metadata.num_prefill_tokens + \
+                    attn_metadata.num_decode_tokens
+            else:
+                # for v1 attention backends
+                batchsize = attn_metadata.num_input_tokens
+            # we use synchronous scheduling right now,
+            # adding a sync point here should not affect
+            # scheduling of the next batch
+            torch.cuda.synchronize()
+            now = time.perf_counter()
+            # time measurement is in milliseconds
+            batchsize_forward_time[batchsize].append(
+                (now - forward_start_time) * 1000)
+            if now - last_logging_time > batchsize_logging_interval:
+                last_logging_time = now
+                forward_stats = []
+                for bs, times in batchsize_forward_time.items():
+                    if len(times) <= 1:
+                        # can be cudagraph / profiling run
+                        continue
+                    medium = torch.quantile(torch.tensor(times), q=0.5).item()
+                    medium = round(medium, 2)
+                    forward_stats.append((bs, len(times), medium))
+                forward_stats.sort(key=lambda x: x[1], reverse=True)
+                if forward_stats:
+                    logger.info(("Batchsize forward time stats "
+                                 "(batchsize, count, median_time(ms)): %s"),
+                                forward_stats)
+        _forward_context = prev_context

.venv/lib/python3.11/site-packages/vllm/logger.py ADDED Viewed

	@@ -0,0 +1,210 @@

+# SPDX-License-Identifier: Apache-2.0
+"""Logging configuration for vLLM."""
+import datetime
+import json
+import logging
+import os
+import sys
+from functools import lru_cache, partial
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from types import MethodType
+from typing import Any, Optional, cast
+import vllm.envs as envs
# Snapshot the logging-related environment settings at import time so the
# rest of this module works with plain module-level constants.
VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX

# Record layout: "<prefix><LEVEL> <time> <file>:<line>] <message>".
_FORMAT = (f"{VLLM_LOGGING_PREFIX}"
           "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s")
_DATE_FORMAT = "%m-%d %H:%M:%S"

# Default ``logging.config.dictConfig`` schema, used when no custom config
# file is supplied.
DEFAULT_LOGGING_CONFIG = dict(
    formatters={
        "vllm": {
            "class": "vllm.logging_utils.NewLineFormatter",
            "datefmt": _DATE_FORMAT,
            "format": _FORMAT,
        },
    },
    handlers={
        # The handler applies the user-selected level and writes to stdout.
        "vllm": {
            "class": "logging.StreamHandler",
            "formatter": "vllm",
            "level": VLLM_LOGGING_LEVEL,
            "stream": "ext://sys.stdout",
        },
    },
    loggers={
        # The logger itself lets everything from DEBUG up through and does
        # not propagate to ancestor loggers.
        "vllm": {
            "handlers": ["vllm"],
            "level": "DEBUG",
            "propagate": False,
        },
    },
    version=1,
    disable_existing_loggers=False,
)
@lru_cache(maxsize=128)
def _print_info_once(logger: Logger, msg: str) -> None:
    """Log ``msg`` at INFO level unless this ``(logger, msg)`` pair is cached.

    The ``lru_cache`` wrapper skips the body for argument pairs it still
    holds, which is what suppresses repeated messages.
    """
    # stacklevel=2 makes the record carry the original caller's line info
    # instead of pointing at this helper.
    logger.info(msg, stacklevel=2)
@lru_cache(maxsize=128)
def _print_warning_once(logger: Logger, msg: str) -> None:
    """Log ``msg`` at WARNING level unless this ``(logger, msg)`` pair is cached.

    The ``lru_cache`` wrapper skips the body for argument pairs it still
    holds, which is what suppresses repeated messages.
    """
    # stacklevel=2 makes the record carry the original caller's line info
    # instead of pointing at this helper.
    logger.warning(msg, stacklevel=2)
class _VllmLogger(Logger):
    """
    Typing-only description of the loggers handed out by this module.

    Note:
        This class exists purely so type checkers know about the extra
        methods. The methods are actually patched directly onto the
        :class:`logging.Logger` instance to avoid conflicting with other
        libraries such as `intel_extension_for_pytorch.utils._logger`.
    """

    def info_once(self, msg: str) -> None:
        """
        Same as :meth:`info`, except that later calls repeating the same
        message are silently dropped.
        """
        _print_info_once(self, msg)

    def warning_once(self, msg: str) -> None:
        """
        Same as :meth:`warning`, except that later calls repeating the same
        message are silently dropped.
        """
        _print_warning_once(self, msg)
def _configure_vllm_root_logger() -> None:
    """Apply the vLLM logging configuration through ``dictConfig``.

    The configuration is chosen as follows:

    * nothing is configured when ``VLLM_CONFIGURE_LOGGING`` is falsy;
    * ``DEFAULT_LOGGING_CONFIG`` is used when it is truthy;
    * a JSON file at ``VLLM_LOGGING_CONFIG_PATH``, when given, replaces the
      default entirely.

    Raises:
        RuntimeError: If a config path is given while logging configuration
            is disabled, or if the config file does not exist.
        ValueError: If the config file does not contain a JSON object.
    """
    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
        raise RuntimeError(
            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
            "implies VLLM_CONFIGURE_LOGGING. Please enable "
            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")

    logging_config: dict[str, Any] = {}
    if VLLM_CONFIGURE_LOGGING:
        logging_config = DEFAULT_LOGGING_CONFIG

    if VLLM_LOGGING_CONFIG_PATH:
        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
            # Exceptions do not interpolate "%s" arguments the way logging
            # calls do, so the message is formatted explicitly.
            raise RuntimeError(
                "Could not load logging config. File does not exist: "
                f"{VLLM_LOGGING_CONFIG_PATH}")
        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
            custom_config = json.load(file)
        if not isinstance(custom_config, dict):
            raise ValueError(
                "Invalid logging config. Expected Dict, got "
                f"{type(custom_config).__name__}.")
        logging_config = custom_config

    for formatter in logging_config.get("formatters", {}).values():
        # This provides backwards compatibility after #10134.
        if formatter.get("class") == "vllm.logging.NewLineFormatter":
            formatter["class"] = "vllm.logging_utils.NewLineFormatter"

    if logging_config:
        dictConfig(logging_config)
def init_logger(name: str) -> _VllmLogger:
    """Return the logger called ``name`` with the "once" helpers attached.

    Going through this function (instead of calling ``logging.getLogger``
    directly) ensures the root vllm logger has already been configured by
    the time the logger is retrieved.
    """
    vllm_logger = logging.getLogger(name)
    # Bind the helpers to this particular instance rather than to the
    # Logger class, so other users of ``logging.Logger`` are unaffected.
    vllm_logger.info_once = MethodType(_print_info_once, vllm_logger)
    vllm_logger.warning_once = MethodType(_print_warning_once, vllm_logger)
    return cast(_VllmLogger, vllm_logger)
+# The root logger is initialized when the module is imported.
+# This is thread-safe because Python's import lock ensures the module
+# body runs only once, even when several threads import it concurrently.
+_configure_vllm_root_logger()
+logger = init_logger(__name__)
def _trace_calls(log_path, root_dir, frame, event, arg=None):
    """Trace function for ``sys.settrace`` that logs calls and returns.

    For 'call' and 'return' events of frames whose file lives under
    ``root_dir``, one line is appended to ``log_path`` naming the function
    and the frame it was called from / returns to.

    Returns:
        ``None`` for call/return events of frames outside ``root_dir``;
        otherwise a new partial of this function bound to the same
        ``log_path`` and ``root_dir``.
    """
    if event not in ('call', 'return'):
        return partial(_trace_calls, log_path, root_dir)

    code = frame.f_code
    filename = code.co_filename
    if not filename.startswith(root_dir):
        # only log the functions in the vllm root_dir
        return None
    lineno = frame.f_lineno
    func_name = code.co_name

    try:
        caller = frame.f_back
        if caller is None:
            # initial frame: there is no caller to report
            last_filename, last_lineno, last_func_name = "", 0, ""
        else:
            last_filename = caller.f_code.co_filename
            last_lineno = caller.f_lineno
            last_func_name = caller.f_code.co_name

        if event == 'call':
            action, direction = "Call to", "from"
        else:
            action, direction = "Return from", "to"

        with open(log_path, 'a') as f:
            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            f.write(f"{ts} {action} {func_name} in {filename}:{lineno}"
                    f" {direction} {last_func_name} in {last_filename}:"
                    f"{last_lineno}\n")
    except NameError:
        # modules are deleted during shutdown
        pass
    return partial(_trace_calls, log_path, root_dir)
def enable_trace_function_call(log_file_path: str,
                               root_dir: Optional[str] = None):
    """
    Turn on tracing of every function call made by code under `root_dir`.

    Intended for debugging hangs or crashes.

    `log_file_path` is the file the trace lines are appended to.
    `root_dir` is the root directory of the code to trace. When it is None,
    the vllm root directory is used.

    Note that the trace is installed per thread: only threads that call
    this function are traced, other threads are not affected.
    """
    logger.warning(
        "VLLM_TRACE_FUNCTION is enabled. It will record every"
        " function executed by Python. This will slow down the code. It "
        "is suggested to be used for debugging hang or crashes only.")
    logger.info("Trace frame log is saved to %s", log_file_path)

    traced_root = root_dir
    if traced_root is None:
        # by default, this is the vllm root directory, i.e. the parent of
        # the directory containing this file
        containing_dir = os.path.dirname(__file__)
        traced_root = os.path.dirname(containing_dir)

    tracer = partial(_trace_calls, log_file_path, traced_root)
    sys.settrace(tracer)

.venv/lib/python3.11/site-packages/vllm/logits_process.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import Callable, List, Tuple, Union
+import torch
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
# Two-argument form: (generated token ids, logits) -> logits.
_GeneratedOnlyProcessor = Callable[[List[int], torch.Tensor], torch.Tensor]
# Three-argument form, with the prompt token ids passed first.
_PromptAwareProcessor = Callable[[List[int], List[int], torch.Tensor],
                                 torch.Tensor]
LogitsProcessor = Union[_GeneratedOnlyProcessor, _PromptAwareProcessor]
"""LogitsProcessor is a function that takes a list
of previously generated tokens, the logits tensor
for the next token and, optionally, prompt tokens as a
first argument, and returns a modified tensor of logits
to sample from."""
def get_bad_words_logits_processors(
        bad_words: List[str],
        tokenizer: AnyTokenizer) -> List[LogitsProcessor]:
    """Build the logits processor that bans the given words.

    Each word is tokenized twice, without and with a leading space, so it
    is prohibited both at the beginning and in the middle of text (this is
    related to the ``add_prefix_space`` tokenizer parameter). The
    space-prefixed variant is kept only when it has the same number of
    tokens as the plain variant and starts with a different token.

    Words that tokenize to nothing (for example empty or whitespace-only
    strings) are skipped; previously they raised ``IndexError``.

    Args:
        bad_words: Words that must not be generated.
        tokenizer: Tokenizer used to turn the words into token ids.

    Returns:
        A single-element list holding a ``NoBadWordsLogitsProcessor``.
    """

    def _encode(text: str) -> List[int]:
        if isinstance(tokenizer, MistralTokenizer):
            # Mistral tokenizers should not add special tokens
            return tokenizer.encode(prompt=text)
        return tokenizer.encode(text=text, add_special_tokens=False)

    bad_words_ids: List[List[int]] = []
    for bad_word in bad_words:
        word = bad_word.lstrip()

        plain_ids = _encode(word)
        if not plain_ids:
            # Nothing to ban, and no reference tokens to compare the
            # space-prefixed variant against.
            continue
        bad_words_ids.append(plain_ids)

        spaced_ids = _encode(" " + word)
        # Checking the length first guarantees spaced_ids is non-empty
        # before its first token is read.
        if (len(spaced_ids) == len(plain_ids)
                and spaced_ids[0] != plain_ids[0]):
            bad_words_ids.append(spaced_ids)

    return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)]
class NoBadWordsLogitsProcessor:
    """Logits processor that prevents banned token sequences.

    Single-token bad words are masked through a bias vector built once on
    the first call. Multi-token bad words are handled per call: when the
    most recent tokens equal a bad word's prefix, its final token is masked.
    """
    _SMALLEST_LOGIT = float("-inf")
    _NEUTRAL_LOGIT = 0.0

    def __init__(self, bad_words_ids: List[List[int]]):
        self.bad_words_ids = bad_words_ids
        # Created lazily in __call__, once the vocabulary size and device
        # can be read from the logits.
        self.word_bias: torch.FloatTensor = None

    def __call__(
        self,
        past_tokens_ids: Union[List[int], Tuple[int]],
        logits: torch.FloatTensor,
    ) -> torch.Tensor:
        """Return ``logits`` with the currently banned tokens masked out."""
        if self.word_bias is None:
            self._init_word_bias(logits=logits)

        sequence_bias = torch.zeros_like(logits)
        history_len = len(past_tokens_ids)
        for banned in self.bad_words_ids:
            banned_len = len(banned)
            # 1-token words are already covered by word_bias; words longer
            # than history + 1 cannot be completed by the next token.
            if banned_len == 1 or banned_len > history_len + 1:
                continue

            n_prefix = banned_len - 1
            final_token = banned[-1]
            recent = past_tokens_ids[-n_prefix:]
            required = banned[:n_prefix]
            assert len(recent) == len(required)

            if tuple(recent) == tuple(required):
                step = self._SMALLEST_LOGIT
            else:
                step = self._NEUTRAL_LOGIT
            sequence_bias[final_token] += step

        return logits + self.word_bias + sequence_bias

    def _init_word_bias(self, logits: torch.FloatTensor) -> None:
        """Validate the token ids and build the single-token bias vector."""
        # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor  # noqa: E501
        # from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py
        vocab_size = logits.shape[-1]
        self._check_token_ids_bounds(vocab_size=vocab_size)

        bias = torch.zeros((vocab_size, ),
                           dtype=torch.float,
                           device=logits.device)
        single_token_ids = [
            ids[-1] for ids in self.bad_words_ids if len(ids) == 1
        ]
        if single_token_ids:
            bias[single_token_ids] = self._SMALLEST_LOGIT
        self.word_bias = bias

    def _check_token_ids_bounds(self, vocab_size: int) -> None:
        """Raise ``ValueError`` if any bad-word token id is out of range."""
        invalid_token_ids = [
            token_id for ids in self.bad_words_ids for token_id in ids
            if token_id < 0 or token_id >= vocab_size
        ]
        if invalid_token_ids:
            raise ValueError(
                f"The model vocabulary size is {vocab_size},"
                f" but the following tokens"
                f" were specified as bad: {invalid_token_ids}."
                f" All token id values should be integers satisfying:"
                f" 0 <= token_id < {vocab_size}.")

.venv/lib/python3.11/site-packages/vllm/outputs.py ADDED Viewed

	@@ -0,0 +1,529 @@

+# SPDX-License-Identifier: Apache-2.0
+import time
+from dataclasses import dataclass
+from typing import Dict, Generic, List, MutableSequence, Optional
+from typing import Sequence as GenericSequence
+from typing import Union
+import torch
+from typing_extensions import TypeVar, deprecated
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.inputs import MultiModalPlaceholderDict
+from vllm.sampling_params import RequestOutputKind
+from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
+                           SequenceGroup, SequenceGroupBase, SequenceStatus)
@dataclass
class CompletionOutput:
    """The output data of one completion output of a request.

    Args:
        index: The index of the output in the request.
        text: The generated output text.
        token_ids: The token IDs of the generated output text.
        cumulative_logprob: The cumulative log probability of the generated
            output text.
        logprobs: The log probabilities of the top probability words at each
            position if the logprobs are requested.
        finish_reason: The reason why the sequence is finished.
        stop_reason: The stop string or token id that caused the completion
            to stop, None if the completion finished for some other reason
            including encountering the EOS token.
        lora_request: The LoRA request that was used to generate the output.
    """

    index: int
    text: str
    token_ids: GenericSequence[int]
    cumulative_logprob: Optional[float]
    logprobs: Optional[SampleLogprobs]
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None
    lora_request: Optional[LoRARequest] = None

    def finished(self) -> bool:
        """Return True once a finish reason has been recorded."""
        return self.finish_reason is not None

    def __repr__(self) -> str:
        # lora_request is intentionally left out, as in the field list below.
        parts = (
            f"index={self.index}",
            f"text={self.text!r}",
            f"token_ids={self.token_ids}",
            f"cumulative_logprob={self.cumulative_logprob}",
            f"logprobs={self.logprobs}",
            f"finish_reason={self.finish_reason}",
            f"stop_reason={self.stop_reason}",
        )
        return "CompletionOutput(" + ", ".join(parts) + ")"
@dataclass
class PoolingOutput:
    """The output data of one pooling output of a request.

    Args:
        data: The extracted hidden states.
    """

    data: torch.Tensor

    def __repr__(self) -> str:
        return f"PoolingOutput(data={self.data})"

    def __eq__(self, other: object) -> bool:
        # Equal only to instances of the same class whose tensors compare
        # equal element-wise.
        if not isinstance(other, self.__class__):
            return False
        return bool((self.data == other.data).all())

    @property
    @deprecated("`LLM.encode()` now stores raw outputs in the `data` "
                "attribute. To return embeddings, use `LLM.embed()`. "
                "To return class probabilities, use `LLM.classify()` "
                "and access the `probs` attribute. ")
    def embedding(self) -> list[float]:
        """Deprecated accessor returning ``data`` as a Python list."""
        return self.data.tolist()
class RequestOutput:
    """The output data of a completion request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
                For encoder/decoder models, this is the
                decoder input prompt.
        prompt_token_ids: The token IDs of the prompt.
                          For encoder/decoder models, this is the
                          decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
                        None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                  None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        multi_modal_placeholders: Keyword-only. Stored as an empty dict
                                  when omitted or None.
    """

    def __init__(
        self,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]],
        prompt_logprobs: Optional[PromptLogprobs],
        outputs: List[CompletionOutput],
        finished: bool,
        metrics: Optional[RequestMetrics] = None,
        lora_request: Optional[LoRARequest] = None,
        encoder_prompt: Optional[str] = None,
        encoder_prompt_token_ids: Optional[List[int]] = None,
        num_cached_tokens: Optional[int] = None,
        *,
        multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
    ) -> None:
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.multi_modal_placeholders = multi_modal_placeholders or {}
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens

    @classmethod
    def new(
        cls,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]],
        text: str,
        token_ids: List[int],
        finished: bool = False,
    ) -> "RequestOutput":
        """Initialize a new RequestOutput object.

        The result holds exactly one completion (index 0) and carries no
        logprobs.
        """
        # TODO: Support `n` > 1.
        completion_output = CompletionOutput(
            index=0,
            text=text,
            token_ids=token_ids,
            cumulative_logprob=None,
            logprobs=None,  # TODO
        )

        return RequestOutput(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            prompt_logprobs=None,  # TODO
            outputs=[completion_output],
            finished=finished,
        )

    def add(self, next_output: "RequestOutput") -> None:
        """Merge subsequent RequestOutput into this one.

        Prompt fields are overwritten with those of ``next_output``; the
        text, token ids and logprobs of the first completion are appended
        to; ``finished`` becomes true once either output is finished.
        """
        self.prompt = next_output.prompt
        self.prompt_token_ids = next_output.prompt_token_ids
        self.prompt_logprobs = next_output.prompt_logprobs
        self.finished |= next_output.finished

        #TODO assuming n == 1 for now
        completion = self.outputs[0]
        next_completion = next_output.outputs[0]
        completion.text += next_completion.text
        # token_ids may be an immutable sequence; copy it into a list
        # before extending.
        if not isinstance(completion.token_ids, MutableSequence):
            completion.token_ids = list(completion.token_ids)
        completion.token_ids.extend(next_completion.token_ids)
        if next_completion.logprobs:
            assert completion.logprobs is not None
            completion.logprobs.extend(next_completion.logprobs)
        completion.cumulative_logprob = next_completion.cumulative_logprob

    @classmethod
    def from_seq_group(
        cls, seq_group: SequenceGroup, use_cache: bool,
        seq_id_to_seq_group: Dict[str, SequenceGroupBase]
    ) -> Optional["RequestOutput"]:
        """Build a RequestOutput from the current state of ``seq_group``.

        Args:
            seq_group: The sequence group to report on.
            use_cache: When true, the ``RequestOutput`` and
                ``CompletionOutput`` objects cached on ``seq_group`` are
                reused and updated in place instead of being re-created.
            seq_id_to_seq_group: Maps request ids to the group object that
                assembles them. If ``seq_group.request_id`` is present, the
                output is built from the assembled group instead.

        Returns:
            The request output, or None when the assembling group has
            nothing to report yet, or when only the final output was
            requested and the group is not finished.

        Raises:
            ValueError: If ``seq_group`` has no sampling parameters.
        """
        finished = seq_group.is_finished()

        if seq_group.request_id in seq_id_to_seq_group:
            group: SequenceGroupBase = seq_id_to_seq_group[
                seq_group.request_id]
            assembled_seq_group = group.maybe_assemble_group(seq_group)
            if finished:
                group.finish_seq(seq_group)
            if assembled_seq_group is None:
                return None
            return cls.from_seq_group(assembled_seq_group, use_cache,
                                      seq_id_to_seq_group)

        sampling_params = seq_group.sampling_params
        if sampling_params is None:
            raise ValueError(
                "Sampling parameters are missing for a CompletionRequest.")

        if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and (
                not finished):
            return None

        # Init cache (if needed)
        if use_cache and seq_group.cached_request_output is None:
            seq_group.cached_request_output = RequestOutput(  # type: ignore
                request_id="",
                prompt=None,
                prompt_token_ids=[],
                prompt_logprobs=None,
                outputs=[],
                finished=False)

        top_n_seqs = seq_group.get_seqs()

        # Create the outputs.
        # NOTE: We need omit logprobs here explicitly because the sequence
        # always has the logprobs of the sampled tokens even if the
        # logprobs are not requested.
        include_logprobs = sampling_params.logprobs is not None
        text_buffer_length = sampling_params.output_text_buffer_length
        delta = sampling_params.output_kind == RequestOutputKind.DELTA

        outputs = []
        include_prompt = True
        # num_cached_tokens should be the same for all the sequences
        num_cached_tokens = None
        for i, seq in enumerate(top_n_seqs):
            output_text = seq.get_output_text_to_return(
                text_buffer_length, delta)

            output_token_ids = seq.get_output_token_ids_to_return(delta)
            # The token ids may come back as a single int rather than a
            # sequence.
            num_output_tokens = 1 if isinstance(output_token_ids,
                                                int) else len(output_token_ids)
            num_cached_tokens = seq.data.get_num_cached_tokens()

            output_logprobs = seq.output_logprobs if include_logprobs else None

            if delta:
                # Slice logprobs delta if applicable
                if output_logprobs:
                    output_logprobs = output_logprobs[-num_output_tokens:]
                # Don't include prompt if this is after the first output
                # containing decode token ids
                if include_prompt and seq.get_output_len() > num_output_tokens:
                    include_prompt = False

            if use_cache:
                # Get cached output object
                cached_outputs = seq_group.cached_request_output.outputs  # type: ignore
                if i >= len(cached_outputs):
                    cached_outputs.append(
                        CompletionOutput(index=i,
                                         text="",
                                         token_ids=[],
                                         cumulative_logprob=None,
                                         logprobs=None,
                                         finish_reason=None,
                                         stop_reason=None))
                output = cached_outputs[i]

                # Init cached output object
                assert output.index == i
                output.text = output_text

                if isinstance(output_token_ids, int):
                    output.token_ids.clear()
                    output.token_ids.append(output_token_ids)
                else:
                    output.token_ids = output_token_ids

                output.cumulative_logprob = seq.get_cumulative_logprob() \
                    if include_logprobs else None
                output.logprobs = output_logprobs
                output.finish_reason = SequenceStatus.get_finished_reason(
                    seq.status)
                output.stop_reason = seq.stop_reason

            else:
                # `i` is already the position of `seq` in top_n_seqs, so no
                # linear `list.index` search is needed per sequence.
                output = CompletionOutput(
                    i, output_text, [output_token_ids]
                    if isinstance(output_token_ids, int) else output_token_ids,
                    seq.get_cumulative_logprob() if include_logprobs else None,
                    output_logprobs,
                    SequenceStatus.get_finished_reason(seq.status),
                    seq.stop_reason)

            outputs.append(output)

        # Every sequence in the sequence group should have the same prompt.
        if include_prompt:
            prompt = seq_group.prompt
            prompt_token_ids = seq_group.prompt_token_ids
            encoder_prompt = seq_group.encoder_prompt
            encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids
            prompt_logprobs = seq_group.prompt_logprobs
        else:
            prompt = None
            prompt_token_ids = None
            encoder_prompt = None
            encoder_prompt_token_ids = None
            prompt_logprobs = None
        finished_time = time.time() if finished else None
        seq_group.set_finished_time(finished_time)

        init_kwargs = {
            "request_id": seq_group.request_id,
            "prompt": prompt,
            "prompt_token_ids": prompt_token_ids,
            "prompt_logprobs": prompt_logprobs,
            "outputs": outputs,
            "finished": finished,
            "metrics": seq_group.metrics,
            "lora_request": seq_group.lora_request,
            "encoder_prompt": encoder_prompt,
            "encoder_prompt_token_ids": encoder_prompt_token_ids,
            "num_cached_tokens": num_cached_tokens,
            "multi_modal_placeholders": seq_group.multi_modal_placeholders
        }

        if use_cache:
            # Re-run __init__ on the cached object so it is updated in
            # place rather than replaced.
            request_output = seq_group.cached_request_output
            request_output.__init__(**init_kwargs)  # type: ignore
        else:
            request_output = cls(**init_kwargs)  # type: ignore

        return request_output

    def __repr__(self) -> str:
        return (f"RequestOutput(request_id={self.request_id}, "
                f"prompt={self.prompt!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"encoder_prompt={self.encoder_prompt!r}, "
                f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"outputs={self.outputs}, "
                f"finished={self.finished}, "
                f"metrics={self.metrics}, "
                f"lora_request={self.lora_request}, "
                f"num_cached_tokens={self.num_cached_tokens}, "
                f"multi_modal_placeholders={self.multi_modal_placeholders})")
+_O = TypeVar("_O", default=PoolingOutput)
class PoolingRequestOutput(Generic[_O]):
    """
    The output data of a pooling request to the LLM.

    Args:
        request_id (str): A unique identifier for the pooling request.
        outputs (PoolingOutput): The pooling results for the given input.
        prompt_token_ids (List[int]): A list of token IDs used in the prompt.
        finished (bool): A flag indicating whether the pooling is completed.
    """

    def __init__(self, request_id: str, outputs: _O,
                 prompt_token_ids: List[int], finished: bool):
        self.request_id = request_id
        self.prompt_token_ids = prompt_token_ids
        self.finished = finished
        self.outputs = outputs

    @staticmethod
    def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
        """Build the pooling output for ``seq_group``.

        The group's pooled data must be present; it is converted to a
        float32 CPU tensor and wrapped in a ``PoolingOutput``.
        """
        raw = seq_group.pooled_data
        assert raw is not None

        cpu_data = raw.to(dtype=torch.float32, device="cpu")
        return PoolingRequestOutput(
            seq_group.request_id,
            PoolingOutput(cpu_data),
            seq_group.prompt_token_ids,
            seq_group.is_finished(),
        )

    def __repr__(self):
        """
        Returns a string representation of a PoolingRequestOutput instance.

        The representation names the concrete class and lists the
        request_id, outputs, prompt_token_ids and finished flag.

        Returns:
            str: A string representation of the PoolingRequestOutput instance.
        """
        cls_name = type(self).__name__
        return (f"{cls_name}(request_id={self.request_id!r}, "
                f"outputs={self.outputs!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"finished={self.finished})")
class RequestOutputFactory:
    """Chooses the output class that matches a sequence group."""

    @staticmethod
    def create(seq_group: SequenceGroup,
               seq_id_to_seq_group: Dict[str, SequenceGroupBase],
               use_cache: bool = False):
        """Create the request output for ``seq_group``.

        Groups that carry pooled data produce a ``PoolingRequestOutput``;
        all others go through ``RequestOutput.from_seq_group``.
        """
        if seq_group.pooled_data is None:
            return RequestOutput.from_seq_group(seq_group, use_cache,
                                                seq_id_to_seq_group)
        return PoolingRequestOutput.from_seq_group(seq_group)
@dataclass
class EmbeddingOutput:
    """The output data of one embedding output of a request.

    Args:
        embedding: The embedding vector, which is a list of floats.
        Its length depends on the hidden dimension of the model.
    """

    embedding: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Convert a pooling output holding a 1-D tensor into an embedding.

        Raises:
            ValueError: If the pooled tensor is not one-dimensional.
        """
        vector = pooling_output.data
        if vector.ndim == 1:
            return EmbeddingOutput(vector.tolist())
        raise ValueError("pooled_data should be a 1-D embedding vector")

    @property
    def hidden_size(self) -> int:
        """Number of entries in the embedding vector."""
        return len(self.embedding)

    def __repr__(self) -> str:
        # Only the size is shown, not the vector itself.
        return f"EmbeddingOutput(hidden_size={self.hidden_size})"
class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]):
    """Pooling request output whose payload is an ``EmbeddingOutput``."""

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        """Re-wrap a generic pooling request output as an embedding output."""
        embedding = EmbeddingOutput.from_base(request_output.outputs)
        return EmbeddingRequestOutput(
            request_id=request_output.request_id,
            outputs=embedding,
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )
@dataclass
class ClassificationOutput:
    """The output data of one classification output of a request.

    Args:
        probs: The probability vector, which is a list of floats.
        Its length depends on the number of classes.
    """

    probs: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Convert a pooling output holding a 1-D tensor into class probs.

        Raises:
            ValueError: If the pooled tensor is not one-dimensional.
        """
        vector = pooling_output.data
        if vector.ndim == 1:
            return ClassificationOutput(vector.tolist())
        raise ValueError("pooled_data should be a 1-D probability vector")

    @property
    def num_classes(self) -> int:
        """Number of entries in the probability vector."""
        return len(self.probs)

    def __repr__(self) -> str:
        # Only the size is shown, not the probabilities themselves.
        return f"ClassificationOutput(num_classes={self.num_classes})"
class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]):
    """Pooling request output whose payload is a ``ClassificationOutput``."""

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        """Re-wrap a generic pooling request output as a classification."""
        classification = ClassificationOutput.from_base(request_output.outputs)
        return ClassificationRequestOutput(
            request_id=request_output.request_id,
            outputs=classification,
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )
@dataclass
class ScoringOutput:
    """The output data of one scoring output of a request.

    Args:
        score: The similarity score, which is a scalar value.
    """

    score: float

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Convert a pooling output holding a 0-D tensor into a score.

        Raises:
            ValueError: If the pooled tensor is not zero-dimensional.
        """
        scalar = pooling_output.data
        if scalar.ndim == 0:
            return ScoringOutput(scalar.item())
        raise ValueError("pooled_data should be a scalar score")

    def __repr__(self) -> str:
        return f"ScoringOutput(score={self.score})"

    @property
    @deprecated("`LLM.score()` now returns scalar scores. "
                "Please access it via the `score` attribute. ")
    def embedding(self) -> list[float]:
        """Deprecated accessor returning the score as a one-element list."""
        return [self.score]
+class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):
+    """Pooling request output whose payload is a `ScoringOutput`."""
+    @staticmethod
+    def from_base(request_output: PoolingRequestOutput):
+        """Build a `ScoringRequestOutput` from a generic pooling result.
+        The request id, prompt token ids and finished flag are copied as-is;
+        the pooled output is converted with `ScoringOutput.from_base`.
+        """
+        fields = dict(
+            request_id=request_output.request_id,
+            outputs=ScoringOutput.from_base(request_output.outputs),
+            prompt_token_ids=request_output.prompt_token_ids,
+            finished=request_output.finished,
+        )
+        return ScoringRequestOutput(**fields)

.venv/lib/python3.11/site-packages/vllm/platforms/__init__.py ADDED Viewed

	@@ -0,0 +1,237 @@

+# SPDX-License-Identifier: Apache-2.0
+import logging
+import traceback
+from itertools import chain
+from typing import TYPE_CHECKING, Optional
+from vllm.plugins import load_plugins_by_group
+from vllm.utils import resolve_obj_by_qualname
+from .interface import _Backend  # noqa: F401
+from .interface import CpuArchEnum, Platform, PlatformEnum
+logger = logging.getLogger(__name__)
+def tpu_platform_plugin() -> Optional[str]:
+    """Return the TPU platform class path if `libtpu` can be imported.
+    Any exception raised by the import is treated as "no TPU" and yields
+    `None`.
+    """
+    try:
+        # While it's technically possible to install libtpu on a
+        # non-TPU machine, this is a very uncommon scenario. Therefore,
+        # we assume that libtpu is installed if and only if the machine
+        # has TPUs.
+        import libtpu  # noqa: F401
+    except Exception:
+        return None
+    return "vllm.platforms.tpu.TpuPlatform"
+def cuda_platform_plugin() -> Optional[str]:
+    """Return the CUDA platform class path if a CUDA device is detected.
+    Detection asks NVML for the device count. If that fails with an
+    exception whose class name contains "nvml", the machine is still
+    treated as CUDA when it looks like a Jetson board. Any other exception
+    is re-raised.
+    """
+    def _looks_like_jetson() -> bool:
+        # CUDA is supported on Jetson, but NVML may not be.
+        import os
+        return os.path.isfile("/etc/nv_tegra_release") \
+            or os.path.exists("/sys/class/tegra-firmware")
+    has_cuda = False
+    try:
+        from vllm.utils import import_pynvml
+        nvml = import_pynvml()
+        nvml.nvmlInit()
+        try:
+            if nvml.nvmlDeviceGetCount() > 0:
+                has_cuda = True
+        finally:
+            # Always pair the successful nvmlInit() with a shutdown.
+            nvml.nvmlShutdown()
+    except Exception as e:
+        error_kind = e.__class__.__name__.lower()
+        if "nvml" not in error_kind:
+            # If the error is not related to NVML, re-raise it.
+            raise e
+        if _looks_like_jetson():
+            has_cuda = True
+    return "vllm.platforms.cuda.CudaPlatform" if has_cuda else None
+def rocm_platform_plugin() -> Optional[str]:
+    """Return the ROCm platform class path if `amdsmi` reports a processor.
+    Any exception (missing package, init failure, ...) is swallowed and
+    leaves the result at whatever was determined so far.
+    """
+    found_device = False
+    try:
+        import amdsmi
+        amdsmi.amdsmi_init()
+        try:
+            handles = amdsmi.amdsmi_get_processor_handles()
+            if len(handles) > 0:
+                found_device = True
+        finally:
+            # Always pair the successful amdsmi_init() with a shutdown.
+            amdsmi.amdsmi_shut_down()
+    except Exception:
+        pass
+    if not found_device:
+        return None
+    return "vllm.platforms.rocm.RocmPlatform"
+def hpu_platform_plugin() -> Optional[str]:
+    """Return the HPU platform class path if `habana_frameworks` is findable.
+    Only the import spec is looked up; the package itself is not imported.
+    """
+    try:
+        from importlib import util
+        spec = util.find_spec('habana_frameworks')
+    except Exception:
+        return None
+    if spec is None:
+        return None
+    return "vllm.platforms.hpu.HpuPlatform"
+def xpu_platform_plugin() -> Optional[str]:
+    """Return the XPU platform class path if an XPU is usable via torch.
+    Requires `intel_extension_for_pytorch` and
+    `oneccl_bindings_for_pytorch` to import, and `torch.xpu` to report
+    availability. Any exception is treated as "no XPU".
+    """
+    try:
+        # Presumably IPEX is only installed on machines that have XPUs.
+        import intel_extension_for_pytorch  # noqa: F401
+        import oneccl_bindings_for_pytorch  # noqa: F401
+        import torch
+        xpu_ready = hasattr(torch, 'xpu') and torch.xpu.is_available()
+    except Exception:
+        xpu_ready = False
+    return "vllm.platforms.xpu.XPUPlatform" if xpu_ready else None
+def cpu_platform_plugin() -> Optional[str]:
+    """Return the CPU platform class path for CPU builds or ARM machines.
+    The platform is selected when the installed `vllm` version string
+    contains "cpu", or otherwise when `platform.machine()` starts with
+    "arm". Any exception (e.g. missing package metadata) yields `None`.
+    """
+    cpu_platform = "vllm.platforms.cpu.CpuPlatform"
+    try:
+        from importlib.metadata import version
+        if "cpu" in version("vllm"):
+            return cpu_platform
+        import platform
+        on_arm = platform.machine().lower().startswith("arm")
+    except Exception:
+        return None
+    return cpu_platform if on_arm else None
+def neuron_platform_plugin() -> Optional[str]:
+    """Return the Neuron platform class path if `transformers_neuronx` imports.
+    Only `ImportError` is handled; other exceptions raised by the import
+    propagate to the caller.
+    """
+    try:
+        import transformers_neuronx  # noqa: F401
+    except ImportError:
+        return None
+    return "vllm.platforms.neuron.NeuronPlatform"
+def openvino_platform_plugin() -> Optional[str]:
+    """Return the OpenVINO platform class path for OpenVINO builds of vLLM.
+    The platform is selected when the installed `vllm` version string
+    contains "openvino". Any exception yields `None`.
+    """
+    try:
+        from importlib.metadata import version
+        openvino_build = "openvino" in version("vllm")
+    except Exception:
+        return None
+    if not openvino_build:
+        return None
+    return "vllm.platforms.openvino.OpenVinoPlatform"
+# Built-in detectors, keyed by platform name. Each callable returns the
+# qualified class name of its platform, or None when it is not detected.
+builtin_platform_plugins = {
+    "tpu": tpu_platform_plugin,
+    "cuda": cuda_platform_plugin,
+    "rocm": rocm_platform_plugin,
+    "hpu": hpu_platform_plugin,
+    "xpu": xpu_platform_plugin,
+    "cpu": cpu_platform_plugin,
+    "neuron": neuron_platform_plugin,
+    "openvino": openvino_platform_plugin,
+}
+def resolve_current_platform_cls_qualname() -> str:
+    """Return the qualified class name of the platform vLLM should run on.
+    Built-in detectors and the plugins loaded from the
+    ``vllm.platform_plugins`` group are each run exactly once. An activated
+    out-of-tree plugin takes precedence over built-in detection; within
+    either group at most one plugin may be activated. If nothing is
+    activated, the ``UnspecifiedPlatform`` class name is returned.
+    Raises:
+        RuntimeError: If two or more plugins of the same group are activated.
+    """
+    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
+    def _detect(plugins) -> dict:
+        # Map each activated plugin name to the qualname it reported.
+        activated = {}
+        for name, func in plugins.items():
+            try:
+                qualname = func()
+            except Exception:
+                # Detection is best effort: a detector that fails (or is not
+                # callable) simply does not activate its platform.
+                logger.debug("Platform plugin %s failed during detection.",
+                             name,
+                             exc_info=True)
+                continue
+            if qualname is not None:
+                activated[name] = qualname
+        return activated
+    # The two groups are tracked separately so that an out-of-tree plugin
+    # sharing a name with a built-in one cannot be mistaken for it, and the
+    # recorded qualnames are reused instead of invoking the plugin again.
+    activated_builtin = _detect(builtin_platform_plugins)
+    activated_oot = _detect(platform_plugins)
+    if len(activated_oot) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{list(activated_oot)}")
+    if len(activated_oot) == 1:
+        (name, platform_cls_qualname), = activated_oot.items()
+        logger.info("Platform plugin %s is activated", name)
+        return platform_cls_qualname
+    if len(activated_builtin) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{list(activated_builtin)}")
+    if len(activated_builtin) == 1:
+        (name, platform_cls_qualname), = activated_builtin.items()
+        logger.info("Automatically detected platform %s.", name)
+        return platform_cls_qualname
+    logger.info(
+        "No platform detected, vLLM is running on UnspecifiedPlatform")
+    return "vllm.platforms.interface.UnspecifiedPlatform"
+_current_platform = None
+_init_trace: str = ''
+if TYPE_CHECKING:
+    current_platform: Platform
+def __getattr__(name: str):
+    """Module attribute hook that lazily creates `current_platform`.
+    The platform object is built on first access and cached in
+    `_current_platform`; the call stack of that first access is stored in
+    `_init_trace`. Other names are served from the module globals.
+    Raises:
+        AttributeError: If `name` is neither `current_platform` nor a
+            module global.
+    """
+    global _current_platform
+    global _init_trace
+    if name != 'current_platform':
+        module_globals = globals()
+        if name in module_globals:
+            return module_globals[name]
+        raise AttributeError(
+            f"No attribute named '{name}' exists in {__name__}.")
+    # lazy init current_platform.
+    # 1. out-of-tree platform plugins need `from vllm.platforms import
+    #    Platform` so that they can inherit `Platform` class. Therefore,
+    #    we cannot resolve `current_platform` during the import of
+    #    `vllm.platforms`.
+    # 2. when users use out-of-tree platform plugins, they might run
+    #    `import vllm`, some vllm internal code might access
+    #    `current_platform` during the import, and we need to make sure
+    #    `current_platform` is only resolved after the plugins are loaded
+    #    (we have tests for this, if any developer violates this, they will
+    #    see the test failures).
+    if _current_platform is None:
+        platform_cls = resolve_obj_by_qualname(
+            resolve_current_platform_cls_qualname())
+        _current_platform = platform_cls()
+        # Remember where the platform was first resolved from.
+        _init_trace = "".join(traceback.format_stack())
+    return _current_platform
+__all__ = [
+    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
+    "_init_trace"
+]

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (9.54 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cpu.cpython-311.pyc ADDED Viewed

Binary file (6.86 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cuda.cpython-311.pyc ADDED Viewed

Binary file (20.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/hpu.cpython-311.pyc ADDED Viewed

Binary file (4.86 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/interface.cpython-311.pyc ADDED Viewed

Binary file (15.2 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/neuron.cpython-311.pyc ADDED Viewed

Binary file (3.1 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/openvino.cpython-311.pyc ADDED Viewed

Binary file (8.21 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/rocm.cpython-311.pyc ADDED Viewed

Binary file (9.38 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/tpu.cpython-311.pyc ADDED Viewed

Binary file (5.2 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/xpu.cpython-311.pyc ADDED Viewed

Binary file (7.59 kB). View file

.venv/lib/python3.11/site-packages/vllm/platforms/cpu.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import TYPE_CHECKING, Optional
+import psutil
+import torch
+from vllm.logger import init_logger
+from .interface import Platform, PlatformEnum, _Backend
+logger = init_logger(__name__)
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+logger = init_logger(__name__)
+class CpuPlatform(Platform):
+    """Platform implementation for running vLLM on CPU."""
+    _enum = PlatformEnum.CPU
+    device_name: str = "cpu"
+    device_type: str = "cpu"
+    dispatch_key: str = "CPU"
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return the device name, which is always "cpu"."""
+        return "cpu"
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        """Return the Torch SDPA backend class path.
+        Any other requested backend is only logged, never honoured.
+        """
+        sdpa_requested = selected_backend == _Backend.TORCH_SDPA
+        if not sdpa_requested:
+            logger.info("Cannot use %s backend on CPU.", selected_backend)
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Return the total virtual memory reported by psutil."""
+        memory_stats = psutil.virtual_memory()
+        return memory_stats.total
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """Async output processing is never reported as supported on CPU."""
+        return False
+    @classmethod
+    def inference_mode(cls):
+        """Return a `torch.no_grad()` context manager."""
+        return torch.no_grad()
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """Adjust `vllm_config` in place for the CPU backend.
+        Forces eager mode, fills in the default block size and KV cache
+        space, switches fp16 to bf16 where needed, selects the "mp"
+        executor and CPU worker classes, and exports the environment
+        variables used by the CPU executor.
+        Raises:
+            RuntimeError: If VLLM_CPU_KVCACHE_SPACE is negative.
+        """
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # If the feature combo become valid
+        if not model_config.enforce_eager:
+            logger.warning(
+                "CUDA graph is not supported on CPU, fallback to the eager "
+                "mode.")
+            model_config.enforce_eager = True
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+        if kv_cache_space < 0:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+        if kv_cache_space == 0:
+            # Zero is treated as "not set": fall back to 4 GiB.
+            cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+            logger.warning(
+                "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                "for CPU backend is not set, using 4 by default.")
+        else:
+            cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        scheduler_config = vllm_config.scheduler_config
+        chunked_or_prefix = (scheduler_config.chunked_prefill_enabled
+                             or cache_config.enable_prefix_caching)
+        if chunked_or_prefix and model_config.dtype == torch.half:
+            logger.warning("Chunked-prefill on the CPU backend only does not"
+                           " support fp16 for now, cast to bf16.")
+            model_config.dtype = torch.bfloat16
+        parallel_config = vllm_config.parallel_config
+        requested_backend = parallel_config.distributed_executor_backend
+        if requested_backend is not None and requested_backend != "mp":
+            logger.warning(("%s is not supported on CPU, fallback to mp "
+                            "distributed executor backend."),
+                           parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "mp"
+        if parallel_config.worker_cls == "auto":
+            cpu_worker = "vllm.worker.cpu_worker.CPUWorker"
+            if vllm_config.speculative_config:
+                # The spec-decode worker wraps the plain CPU worker.
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                parallel_config.sd_worker_cls = cpu_worker
+            else:
+                parallel_config.worker_cls = cpu_worker
+        assert vllm_config.device_config.device_type == "cpu"
+        #
+        # Environment variables for CPU executor
+        #
+        # Disable torch async compiling which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+        # Intel OpenMP setting
+        ld_preload = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload:
+            os.environ.update({
+                # The time(milliseconds) that a thread should wait after
+                # completing the execution of a parallel region, before
+                # sleeping.
+                'KMP_BLOCKTIME': "1",
+                # Prevents the CPU to run into low performance state
+                'KMP_TPAUSE': "0",
+                # Provides fine granularity parallelism
+                'KMP_FORKJOIN_BARRIER_PATTERN': "dist,dist",
+                'KMP_PLAIN_BARRIER_PATTERN': "dist,dist",
+                'KMP_REDUCTION_BARRIER_PATTERN': "dist,dist",
+            })
+        # To hint IPEX uses shared memory based AllReduce
+        tensor_parallel_size = vllm_config.parallel_config.tensor_parallel_size
+        os.environ["LOCAL_WORLD_SIZE"] = str(tensor_parallel_size)
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Log a warning and report pinned memory as unavailable."""
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """Return the class path of the CPU Punica (LoRA) wrapper."""
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"

.venv/lib/python3.11/site-packages/vllm/platforms/cuda.py ADDED Viewed

	@@ -0,0 +1,390 @@

+# SPDX-License-Identifier: Apache-2.0
+"""Code inside this file can safely assume cuda platform, e.g. importing
+pynvml. However, it should not initialize cuda context.
+"""
+import os
+from functools import lru_cache, wraps
+from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar,
+                    Union)
+import torch
+from typing_extensions import ParamSpec
+# import custom ops, trigger op registration
+import vllm._C  # noqa
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.utils import import_pynvml
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+logger = init_logger(__name__)
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+pynvml = import_pynvml()
+# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
+# see https://github.com/huggingface/diffusers/issues/9704 for details
+torch.backends.cuda.enable_cudnn_sdp(False)
+def device_id_to_physical_device_id(device_id: int) -> int:
+    """Translate a logical device index into a physical device index.
+    When `CUDA_VISIBLE_DEVICES` is set, `device_id` indexes into its
+    comma-separated entries; otherwise `device_id` is returned unchanged.
+    Raises:
+        RuntimeError: If `CUDA_VISIBLE_DEVICES` is set to an empty string.
+    """
+    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if visible_devices is None:
+        return device_id
+    entries = visible_devices.split(",")
+    if entries == [""]:
+        msg = (
+            "CUDA_VISIBLE_DEVICES is set to empty string, which means"
+            " GPU support is disabled. If you are using ray, please unset"
+            " the environment variable `CUDA_VISIBLE_DEVICES` inside the"
+            " worker/actor. "
+            "Check https://github.com/vllm-project/vllm/issues/8402 for"
+            " more information.")
+        raise RuntimeError(msg)
+    return int(entries[device_id])
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+    """Decorate `fn` so every call runs between nvmlInit and nvmlShutdown.
+    The shutdown happens even when `fn` raises.
+    """
+    @wraps(fn)
+    def nvml_scoped(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            result = fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+        return result
+    return nvml_scoped
+class CudaPlatformBase(Platform):
+    """Shared CUDA platform logic; device queries are left to subclasses."""
+    _enum = PlatformEnum.CUDA
+    device_name: str = "cuda"
+    device_type: str = "cuda"
+    dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
+    @classmethod
+    def get_device_capability(cls,
+                              device_id: int = 0
+                              ) -> Optional[DeviceCapability]:
+        """Return the device's compute capability (subclass hook)."""
+        raise NotImplementedError
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return the device's name (subclass hook)."""
+        raise NotImplementedError
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Return the device's total memory (subclass hook)."""
+        raise NotImplementedError
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """Return True unless eager mode is enforced (a warning is logged)."""
+        if not enforce_eager:
+            return True
+        logger.warning(
+            "To see benefits of async output processing, enable CUDA "
+            "graph. Since, enforce-eager is enabled, async output "
+            "processor cannot be used")
+        return False
+    @classmethod
+    def is_full_nvlink(cls, device_ids: List[int]) -> bool:
+        """Report whether the devices are fully NVLink-connected (hook)."""
+        raise NotImplementedError
+    @classmethod
+    def log_warnings(cls):
+        """Log platform warnings; the base implementation does nothing."""
+        pass
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """Fill in the worker class and default block size in place.
+        Raises:
+            NotImplementedError: If multi-step scheduling or speculative
+                decoding is combined with VLLM_USE_V1.
+        """
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Multi-step scheduling is not supported (and not "
+                        "needed) on VLLM V1. Please launch without "
+                        "--num-scheduler-steps.")
+                parallel_config.worker_cls = \
+                    "vllm.worker.multi_step_worker.MultiStepWorker"
+            elif vllm_config.speculative_config:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Speculative decoding is not yet supported on VLLM V1."
+                    )
+                # The spec-decode worker wraps the regular GPU worker.
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                parallel_config.sd_worker_cls = \
+                    "vllm.worker.worker.Worker"
+            elif envs.VLLM_USE_V1:
+                parallel_config.worker_cls = \
+                        "vllm.v1.worker.gpu_worker.Worker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        """Reset the peak-memory stats, then return max memory allocated."""
+        torch.cuda.reset_peak_memory_stats(device)
+        peak_allocated = torch.cuda.max_memory_allocated(device)
+        return peak_allocated
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1,
+                             use_mla) -> str:
+        """Return the class path of the attention backend to use.
+        V1 and MLA are decided first, then an explicit FlashInfer or
+        XFormers selection. Otherwise FlashAttention is used when the
+        device, dtype, KV cache dtype, block size, head size and installed
+        packages allow it, with XFormers as the fallback.
+        Raises:
+            ValueError: If another, unsupported backend was selected.
+        """
+        xformers_backend = "vllm.attention.backends.xformers.XFormersBackend"
+        if use_v1:
+            logger.info("Using Flash Attention backend on V1 engine.")
+            return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+        if use_mla:
+            logger.info("Using Triton MLA backend.")
+            return "vllm.attention.backends.triton_mla.TritonMLABackend"
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info("Using FlashInfer backend.")
+            return "vllm.attention.backends.flashinfer.FlashInferBackend"
+        if selected_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return xformers_backend
+        if selected_backend == _Backend.FLASH_ATTN:
+            # Explicit FlashAttention still goes through the checks below.
+            pass
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}, "
+                f"with use_v1: {use_v1} use_mla: {use_mla}")
+        # Start from FlashAttention and drop to XFormers on the first
+        # requirement that is not met.
+        flash_attn_usable = True
+        if not cls.has_device_capability(80):
+            # Volta and Turing NVIDIA GPUs.
+            logger.info(
+                "Cannot use FlashAttention-2 backend for Volta and Turing "
+                "GPUs.")
+            flash_attn_usable = False
+        elif dtype not in (torch.float16, torch.bfloat16):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for dtype other than "
+                "torch.float16 or torch.bfloat16.")
+            flash_attn_usable = False
+        elif kv_cache_dtype is not None and \
+            kv_cache_dtype.startswith("fp8"):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+            logger.warning(
+                "Please use FlashInfer backend with FP8 KV Cache for "
+                "better performance by setting environment variable  "
+                "VLLM_ATTENTION_BACKEND=FLASHINFER")
+            flash_attn_usable = False
+        elif block_size % 16 != 0:
+            logger.info(
+                "Cannot use FlashAttention-2 backend for block size not "
+                "divisible by 16.")
+            flash_attn_usable = False
+        # FlashAttn is valid for the model, checking if the package is
+        # installed.
+        if flash_attn_usable:
+            try:
+                import vllm.vllm_flash_attn  # noqa: F401
+                from vllm.attention.backends.flash_attn import (  # noqa: F401
+                    FlashAttentionBackend)
+                head_sizes = FlashAttentionBackend.get_supported_head_sizes()
+                if head_size not in head_sizes:
+                    logger.info(
+                        "Cannot use FlashAttention-2 backend for head size %d.",
+                        head_size)
+                    flash_attn_usable = False
+            except ImportError:
+                logger.info(
+                    "Cannot use FlashAttention-2 backend because the "
+                    "vllm.vllm_flash_attn package is not found. "
+                    "Make sure that vllm_flash_attn was built and installed "
+                    "(on by default).")
+                flash_attn_usable = False
+        if not flash_attn_usable:
+            logger.info("Using XFormers backend.")
+            return xformers_backend
+        logger.info("Using Flash Attention backend.")
+        return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """Return the class path of the GPU Punica (LoRA) wrapper."""
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+class NvmlCudaPlatform(CudaPlatformBase):
+    """CUDA platform whose device queries go through NVML (pynvml)."""
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_capability(cls,
+                              device_id: int = 0
+                              ) -> Optional[DeviceCapability]:
+        """Return the compute capability, or None on RuntimeError."""
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(
+                device_id_to_physical_device_id(device_id))
+            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+        except RuntimeError:
+            return None
+        return DeviceCapability(major=major, minor=minor)
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """Delegate to the parent check; a RuntimeError counts as False."""
+        try:
+            supported = super().has_device_capability(capability, device_id)
+        except RuntimeError:
+            return False
+        return supported
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return the NVML name of the given logical device."""
+        return cls._get_physical_device_name(
+            device_id_to_physical_device_id(device_id))
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Return the NVML UUID of the given logical device."""
+        handle = pynvml.nvmlDeviceGetHandleByIndex(
+            device_id_to_physical_device_id(device_id))
+        return pynvml.nvmlDeviceGetUUID(handle)
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Return the total memory NVML reports for the logical device."""
+        handle = pynvml.nvmlDeviceGetHandleByIndex(
+            device_id_to_physical_device_id(device_id))
+        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        return int(memory_info.total)
+    @classmethod
+    @with_nvml_context
+    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+        """
+        query if the set of gpus are fully connected by nvlink (1 hop)
+        """
+        handles = [
+            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
+        ]
+        # Check every unordered pair of devices exactly once.
+        for index, handle in enumerate(handles):
+            for peer_handle in handles[index + 1:]:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle,
+                        peer_handle,
+                        pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
+                    )
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError:
+                    logger.exception(
+                        "NVLink detection failed. This is normal if"
+                        " your machine has no NVLink equipped.")
+                    return False
+        return True
+    @classmethod
+    def _get_physical_device_name(cls, device_id: int = 0) -> str:
+        # `device_id` is passed to NVML as-is, i.e. as a physical index.
+        # Callers are expected to hold an NVML context already.
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+        return pynvml.nvmlDeviceGetName(handle)
+    @classmethod
+    @with_nvml_context
+    def log_warnings(cls):
+        """Warn about mixed GPU models without CUDA_DEVICE_ORDER=PCI_BUS_ID."""
+        device_count: int = pynvml.nvmlDeviceGetCount()
+        if device_count <= 1:
+            return
+        device_names = [
+            cls._get_physical_device_name(i) for i in range(device_count)
+        ]
+        mixed_models = len(set(device_names)) > 1
+        if (mixed_models
+                and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
+            logger.warning(
+                "Detected different devices in the system: \n%s\nPlease"
+                " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
+                "avoid unexpected behavior.",
+                "\n".join(device_names),
+            )
+class NonNvmlCudaPlatform(CudaPlatformBase):
+    """CUDA platform that queries devices through `torch.cuda`, not NVML."""
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        """Return the compute capability reported by `torch.cuda`."""
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Return the device name reported by `torch.cuda`."""
+        return torch.cuda.get_device_name(device_id)
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Return the total memory from the torch device properties."""
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
+    @classmethod
+    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+        """Always report no NVLink, since it cannot be queried here."""
+        # No exception is being handled at this point, so log with
+        # `error` rather than `exception`, which would append an empty
+        # "NoneType: None" traceback to the message.
+        logger.error(
+            "NVLink detection not possible, as context support was"
+            " not found. Assuming no NVLink available.")
+        return False
+# Autodetect either NVML-enabled or non-NVML platform
+# based on whether NVML is available.
+try:
+    pynvml.nvmlInit()
+except Exception:
+    # On Jetson, NVML is not supported.
+    nvml_available = False
+else:
+    nvml_available = True
+    # Initialisation was only a probe, so release NVML again.
+    pynvml.nvmlShutdown()
+CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
+# Skip the warnings when pynvml is a sphinx autodoc mock module.
+try:
+    from sphinx.ext.autodoc.mock import _MockModule
+    if not isinstance(pynvml, _MockModule):
+        CudaPlatform.log_warnings()
+except ModuleNotFoundError:
+    CudaPlatform.log_warnings()

.venv/lib/python3.11/site-packages/vllm/platforms/hpu.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import TYPE_CHECKING, Optional
+import torch
+from vllm import envs
+from vllm.logger import init_logger
+from .interface import Platform, PlatformEnum, _Backend
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+logger = init_logger(__name__)
+class HpuPlatform(Platform):
+    """Platform implementation for HPU devices."""
+    _enum = PlatformEnum.HPU
+    device_name: str = "hpu"
+    device_type: str = "hpu"
+    dispatch_key: str = "HPU"
+    ray_device_key: str = "HPU"
+    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        """Return the HPU attention backend class path, ignoring all args."""
+        logger.info("Using HPUAttention backend.")
+        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """Async output processing is always reported as supported."""
+        return True
+    @staticmethod
+    def inference_mode():
+        """Return a `torch.no_grad()` context manager."""
+        return torch.no_grad()
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """Validate `vllm_config` for HPU and fill in defaults in place.
+        Raises:
+            NotImplementedError: If multi-step scheduling or speculative
+                decoding is requested.
+        """
+        if vllm_config.scheduler_config.is_multi_step:
+            raise NotImplementedError(
+                "Multi-step execution is not implemented for HPU")
+        if vllm_config.speculative_config is not None:
+            raise NotImplementedError(
+                "Speculative decoding is not implemented for HPU")
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+        # NOTE(kzawora): default block size for Gaudi should be 128
+        # smaller sizes still work, but very inefficiently
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128
+        forks_workers = (parallel_config.distributed_executor_backend == 'mp'
+                         and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork')
+        if not forks_workers:
+            return
+        # Fork is only kept when the user set the variable explicitly.
+        if "VLLM_WORKER_MULTIPROC_METHOD" in os.environ:
+            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                           "might cause application hangs on exit. Using "
+                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                           "as it was explicitly requested.")
+            return
+        logger.warning(
+            "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+            "might cause application hangs on exit. Setting "
+            "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+            "To override that behavior, please set "
+            "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+    @classmethod
+    def is_pin_memory_available(cls):
+        """Log a warning and report pinned memory as unavailable."""
+        logger.warning("Pin memory is not supported on HPU.")
+        return False
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """Return the class path of the HPU Punica (LoRA) wrapper."""
+        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"

.venv/lib/python3.11/site-packages/vllm/platforms/interface.py ADDED Viewed

	@@ -0,0 +1,308 @@

+# SPDX-License-Identifier: Apache-2.0
+import enum
+import platform
+import random
+from platform import uname
+from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
+import numpy as np
+import torch
+from vllm.logger import init_logger
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+logger = init_logger(__name__)
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+class _Backend(enum.Enum):
+    """Identifiers for attention backends.
+    This is the type of the ``selected_backend`` argument of
+    ``Platform.get_attn_backend_cls``. Member values are assigned by
+    ``enum.auto()`` in declaration order.
+    """
+    FLASH_ATTN = enum.auto()
+    FLASH_ATTN_VLLM_V1 = enum.auto()
+    XFORMERS = enum.auto()
+    ROCM_FLASH = enum.auto()
+    TORCH_SDPA = enum.auto()
+    OPENVINO = enum.auto()
+    FLASHINFER = enum.auto()
+    TRITON_MLA = enum.auto()
+    HPU_ATTN = enum.auto()
+    PALLAS = enum.auto()
+    IPEX = enum.auto()
+    BLOCK_SPARSE_FLASH_ATTN = enum.auto()
+    NO_ATTENTION = enum.auto()
+class PlatformEnum(enum.Enum):
+    """Tags stored in ``Platform._enum`` to identify a platform.
+    The ``Platform.is_*`` predicates compare ``_enum`` against these members.
+    """
+    CUDA = enum.auto()
+    ROCM = enum.auto()
+    TPU = enum.auto()
+    HPU = enum.auto()
+    XPU = enum.auto()
+    CPU = enum.auto()
+    NEURON = enum.auto()
+    OPENVINO = enum.auto()
+    OOT = enum.auto()  # checked by Platform.is_out_of_tree()
+    UNSPECIFIED = enum.auto()  # used by UnspecifiedPlatform
+class CpuArchEnum(enum.Enum):
+    """CPU architecture families returned by ``Platform.get_cpu_architecture``."""
+    X86 = enum.auto()
+    ARM = enum.auto()
+    POWERPC = enum.auto()
+    OTHER = enum.auto()  # non-empty machine string that matched no family
+    UNKNOWN = enum.auto()  # platform.machine() returned an empty string
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+class Platform:
+    _enum: PlatformEnum
+    device_name: str
+    device_type: str
+    # available dispatch keys:
+    # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
+    # use "CPU" as a fallback for platforms not registered in PyTorch
+    dispatch_key: str = "CPU"
+    # available ray device keys:
+    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
+    # empty string means the device does not support ray
+    ray_device_key: str = ""
+    # platform-agnostic way to specify the device control environment variable,
+    # .e.g. CUDA_VISIBLE_DEVICES for CUDA.
+    # hint: search for "get_visible_accelerator_ids_env_var" in
+    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
+    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
+    # The torch.compile backend for compiling simple and
+    # standalone functions. The default value is "inductor" to keep
+    # the same behavior as PyTorch.
+    # NOTE: for the forward part of the model, vLLM has another separate
+    # compilation strategy.
+    simple_compile_backend: str = "inductor"
+    supported_quantization: list[str] = []
+    def is_cuda(self) -> bool:
+        return self._enum == PlatformEnum.CUDA
+    def is_rocm(self) -> bool:
+        return self._enum == PlatformEnum.ROCM
+    def is_tpu(self) -> bool:
+        return self._enum == PlatformEnum.TPU
+    def is_hpu(self) -> bool:
+        return self._enum == PlatformEnum.HPU
+    def is_xpu(self) -> bool:
+        return self._enum == PlatformEnum.XPU
+    def is_cpu(self) -> bool:
+        return self._enum == PlatformEnum.CPU
+    def is_neuron(self) -> bool:
+        return self._enum == PlatformEnum.NEURON
+    def is_openvino(self) -> bool:
+        return self._enum == PlatformEnum.OPENVINO
+    def is_out_of_tree(self) -> bool:
+        return self._enum == PlatformEnum.OOT
+    def is_cuda_alike(self) -> bool:
+        """Stateless version of :func:`torch.cuda.is_available`."""
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        """Get the attention backend class of a device."""
+        return ""
+    @classmethod
+    def get_device_capability(
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
+        """Stateless version of :func:`torch.cuda.get_device_capability`."""
+        return None
+    @classmethod
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Test whether this platform is compatible with a device capability.
+        The ``capability`` argument can either be:
+        - A tuple ``(major, minor)``.
+        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
+        """
+        current_capability = cls.get_device_capability(device_id=device_id)
+        if current_capability is None:
+            return False
+        if isinstance(capability, tuple):
+            return current_capability >= capability
+        return current_capability.to_int() >= capability
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Get the name of a device."""
+        raise NotImplementedError
+    @classmethod
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Get the uuid of a device, e.g. the PCI bus ID."""
+        raise NotImplementedError
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Get the total memory of a device in bytes."""
+        raise NotImplementedError
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """
+        Check if the current platform supports async output.
+        """
+        raise NotImplementedError
+    @classmethod
+    def inference_mode(cls):
+        """A device-specific wrapper of `torch.inference_mode`.
+        This wrapper is recommended because some hardware backends such as TPU
+        do not support `torch.inference_mode`. In such a case, they will fall
+        back to `torch.no_grad` by overriding this method.
+        """
+        return torch.inference_mode(mode=True)
+    @classmethod
+    def seed_everything(cls, seed: int) -> None:
+        """
+        Set the seed of each random module.
+        `torch.manual_seed` will set seed on all devices.
+        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
+        """
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """
+        Check and update the configuration for the current platform.
+        It can raise an exception if the configuration is not compatible with
+        the current platform, or it can update the configuration to make it
+        compatible with the current platform.
+        The config is passed by reference, so it can be modified in place.
+        """
+        pass
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        """
+        Verify whether the current platform supports the specified model
+        architecture.
+        - This will raise an Error or Warning based on the model support on
+        the current platform.
+        - By default all models are considered supported.
+        """
+        pass
+    @classmethod
+    def verify_quantization(cls, quant: str) -> None:
+        """
+        Verify whether the quantization is supported by the current platform.
+        """
+        if cls.supported_quantization and \
+            quant not in cls.supported_quantization:
+            raise ValueError(
+                f"{quant} quantization is currently not supported in "
+                f"{cls.device_name}.")
+    @classmethod
+    def get_cpu_architecture(cls) -> CpuArchEnum:
+        """
+        Determine the CPU architecture of the current system.
+        Returns CpuArchEnum indicating the architecture type.
+        """
+        machine = platform.machine().lower()
+        if machine in ("x86_64", "amd64", "i386", "i686"):
+            return CpuArchEnum.X86
+        elif machine.startswith("arm") or machine.startswith("aarch"):
+            return CpuArchEnum.ARM
+        elif machine.startswith("ppc"):
+            return CpuArchEnum.POWERPC
+        return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        """
+        Return the memory usage in bytes.
+        """
+        raise NotImplementedError
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """
+        Return the punica wrapper for current platform.
+        """
+        raise NotImplementedError
+class UnspecifiedPlatform(Platform):
+    """Platform tagged ``PlatformEnum.UNSPECIFIED`` with an empty device type."""
+    _enum = PlatformEnum.UNSPECIFIED
+    device_type = ""

.venv/lib/python3.11/site-packages/vllm/platforms/neuron.py ADDED Viewed

	@@ -0,0 +1,57 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import TYPE_CHECKING, Optional
+from vllm.logger import init_logger
+from .interface import Platform, PlatformEnum
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+logger = init_logger(__name__)
+class NeuronPlatform(Platform):
+    _enum = PlatformEnum.NEURON
+    device_name: str = "neuron"
+    device_type: str = "neuron"
+    ray_device_key: str = "neuron_cores"
+    supported_quantization: list[str] = ["neuron_quant"]
+    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "neuron"
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = \
+                "vllm.worker.neuron_worker.NeuronWorker"
+        if parallel_config.world_size > 1:
+            parallel_config.distributed_executor_backend = "uni"
+        assert (vllm_config.lora_config
+                is None), "LoRA is not supported for Neuron backend."
+        assert (not vllm_config.speculative_config
+                ), "Speculative decoding not yet supported for Neuron backend."
+        cache_config = vllm_config.cache_config
+        if cache_config:
+            # neuron needs block_size = max_model_len
+            vllm_config.cache_config.block_size = \
+                vllm_config.model_config.max_model_len
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False