diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__init__.py b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..13be3a59b3a9216bf2ee2897c14645040cf255d8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__init__.py @@ -0,0 +1,13 @@ +__version__ = "2.4.6" + +from .impl import start_connection +from .types import AddrInfoType +from .utils import addr_to_addr_infos, pop_addr_infos_interleave, remove_addr_infos + +__all__ = ( + "AddrInfoType", + "addr_to_addr_infos", + "pop_addr_infos_interleave", + "remove_addr_infos", + "start_connection", +) diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c9f0424a812a59719a1488ffd57d01200e9b1ab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/_staggered.cpython-311.pyc b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/_staggered.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a6d93f5888167feb415fa1bf653f76d3ef267db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/_staggered.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-311.pyc b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1017a44802c2655ff19ada0243d120ec6f1d00d2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/types.cpython-311.pyc b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/types.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1641f45e6d27271d123bfc8f78c7db11d024813f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/types.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3cb73c1f728a8e4705799564c9f480058fc59ba Binary files /dev/null and b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/_staggered.py b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/_staggered.py new file mode 100644 index 0000000000000000000000000000000000000000..b9bdae324200303578cb5e2e8f60af49acae7695 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/_staggered.py @@ -0,0 +1,202 @@ +import asyncio +import contextlib +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Callable, + Iterable, + List, + Optional, + Set, + Tuple, + TypeVar, + Union, +) + +_T = TypeVar("_T") + + +def _set_result(wait_next: "asyncio.Future[None]") -> None: + """Set the result of a future if it is not already done.""" + if not wait_next.done(): + wait_next.set_result(None) + + +async def _wait_one( 
+ futures: "Iterable[asyncio.Future[Any]]", + loop: asyncio.AbstractEventLoop, +) -> _T: + """Wait for the first future to complete.""" + wait_next = loop.create_future() + + def _on_completion(fut: "asyncio.Future[Any]") -> None: + if not wait_next.done(): + wait_next.set_result(fut) + + for f in futures: + f.add_done_callback(_on_completion) + + try: + return await wait_next + finally: + for f in futures: + f.remove_done_callback(_on_completion) + + +async def staggered_race( + coro_fns: Iterable[Callable[[], Awaitable[_T]]], + delay: Optional[float], + *, + loop: Optional[asyncio.AbstractEventLoop] = None, +) -> Tuple[Optional[_T], Optional[int], List[Optional[BaseException]]]: + """ + Run coroutines with staggered start times and take the first to finish. + + This method takes an iterable of coroutine functions. The first one is + started immediately. From then on, whenever the immediately preceding one + fails (raises an exception), or when *delay* seconds has passed, the next + coroutine is started. This continues until one of the coroutines complete + successfully, in which case all others are cancelled, or until all + coroutines fail. + + The coroutines provided should be well-behaved in the following way: + + * They should only ``return`` if completed successfully. + + * They should always raise an exception if they did not complete + successfully. In particular, if they handle cancellation, they should + probably reraise, like this:: + + try: + # do work + except asyncio.CancelledError: + # undo partially completed work + raise + + Args: + ---- + coro_fns: an iterable of coroutine functions, i.e. callables that + return a coroutine object when called. Use ``functools.partial`` or + lambdas to pass arguments. + + delay: amount of time, in seconds, between starting coroutines. If + ``None``, the coroutines will run sequentially. + + loop: the event loop to use. If ``None``, the running loop is used. + + Returns: + ------- + tuple *(winner_result, winner_index, exceptions)* where + + - *winner_result*: the result of the winning coroutine, or ``None`` + if no coroutines won. + + - *winner_index*: the index of the winning coroutine in + ``coro_fns``, or ``None`` if no coroutines won. If the winning + coroutine may return None on success, *winner_index* can be used + to definitively determine whether any coroutine won. + + - *exceptions*: list of exceptions returned by the coroutines. + ``len(exceptions)`` is equal to the number of coroutines actually + started, and the order is the same as in ``coro_fns``. The winning + coroutine's entry is ``None``. + + """ + loop = loop or asyncio.get_running_loop() + exceptions: List[Optional[BaseException]] = [] + tasks: Set[asyncio.Task[Optional[Tuple[_T, int]]]] = set() + + async def run_one_coro( + coro_fn: Callable[[], Awaitable[_T]], + this_index: int, + start_next: "asyncio.Future[None]", + ) -> Optional[Tuple[_T, int]]: + """ + Run a single coroutine. + + If the coroutine fails, set the exception in the exceptions list and + start the next coroutine by setting the result of the start_next. + + If the coroutine succeeds, return the result and the index of the + coroutine in the coro_fns list. + + If SystemExit or KeyboardInterrupt is raised, re-raise it. 
+ """ + try: + result = await coro_fn() + except (SystemExit, KeyboardInterrupt): + raise + except BaseException as e: + exceptions[this_index] = e + _set_result(start_next) # Kickstart the next coroutine + return None + + return result, this_index + + start_next_timer: Optional[asyncio.TimerHandle] = None + start_next: Optional[asyncio.Future[None]] + task: asyncio.Task[Optional[Tuple[_T, int]]] + done: Union[asyncio.Future[None], asyncio.Task[Optional[Tuple[_T, int]]]] + coro_iter = iter(coro_fns) + this_index = -1 + try: + while True: + if coro_fn := next(coro_iter, None): + this_index += 1 + exceptions.append(None) + start_next = loop.create_future() + task = loop.create_task(run_one_coro(coro_fn, this_index, start_next)) + tasks.add(task) + start_next_timer = ( + loop.call_later(delay, _set_result, start_next) if delay else None + ) + elif not tasks: + # We exhausted the coro_fns list and no tasks are running + # so we have no winner and all coroutines failed. + break + + while tasks or start_next: + done = await _wait_one( + (*tasks, start_next) if start_next else tasks, loop + ) + if done is start_next: + # The current task has failed or the timer has expired + # so we need to start the next task. + start_next = None + if start_next_timer: + start_next_timer.cancel() + start_next_timer = None + + # Break out of the task waiting loop to start the next + # task. + break + + if TYPE_CHECKING: + assert isinstance(done, asyncio.Task) + + tasks.remove(done) + if winner := done.result(): + return *winner, exceptions + finally: + # We either have: + # - a winner + # - all tasks failed + # - a KeyboardInterrupt or SystemExit. + + # + # If the timer is still running, cancel it. + # + if start_next_timer: + start_next_timer.cancel() + + # + # If there are any tasks left, cancel them and than + # wait them so they fill the exceptions list. + # + for task in tasks: + task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await task + + return None, None, exceptions diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/impl.py b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/impl.py new file mode 100644 index 0000000000000000000000000000000000000000..42254a0b4e45134c2a08fbd4f287613d4f0489d3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/impl.py @@ -0,0 +1,210 @@ +"""Base implementation.""" + +import asyncio +import collections +import functools +import itertools +import socket +from typing import List, Optional, Sequence, Union + +from . import _staggered +from .types import AddrInfoType + + +async def start_connection( + addr_infos: Sequence[AddrInfoType], + *, + local_addr_infos: Optional[Sequence[AddrInfoType]] = None, + happy_eyeballs_delay: Optional[float] = None, + interleave: Optional[int] = None, + loop: Optional[asyncio.AbstractEventLoop] = None, +) -> socket.socket: + """ + Connect to a TCP server. + + Create a socket connection to a specified destination. The + destination is specified as a list of AddrInfoType tuples as + returned from getaddrinfo(). + + The arguments are, in order: + + * ``family``: the address family, e.g. ``socket.AF_INET`` or + ``socket.AF_INET6``. + * ``type``: the socket type, e.g. ``socket.SOCK_STREAM`` or + ``socket.SOCK_DGRAM``. + * ``proto``: the protocol, e.g. ``socket.IPPROTO_TCP`` or + ``socket.IPPROTO_UDP``. + * ``canonname``: the canonical name of the address, e.g. + ``"www.python.org"``. 
+ * ``sockaddr``: the socket address + + This method is a coroutine which will try to establish the connection + in the background. When successful, the coroutine returns a + socket. + + The expected use case is to use this method in conjunction with + loop.create_connection() to establish a connection to a server:: + + socket = await start_connection(addr_infos) + transport, protocol = await loop.create_connection( + MyProtocol, sock=socket, ...) + """ + if not (current_loop := loop): + current_loop = asyncio.get_running_loop() + + single_addr_info = len(addr_infos) == 1 + + if happy_eyeballs_delay is not None and interleave is None: + # If using happy eyeballs, default to interleave addresses by family + interleave = 1 + + if interleave and not single_addr_info: + addr_infos = _interleave_addrinfos(addr_infos, interleave) + + sock: Optional[socket.socket] = None + # uvloop can raise RuntimeError instead of OSError + exceptions: List[List[Union[OSError, RuntimeError]]] = [] + if happy_eyeballs_delay is None or single_addr_info: + # not using happy eyeballs + for addrinfo in addr_infos: + try: + sock = await _connect_sock( + current_loop, exceptions, addrinfo, local_addr_infos + ) + break + except (RuntimeError, OSError): + continue + else: # using happy eyeballs + sock, _, _ = await _staggered.staggered_race( + ( + functools.partial( + _connect_sock, current_loop, exceptions, addrinfo, local_addr_infos + ) + for addrinfo in addr_infos + ), + happy_eyeballs_delay, + ) + + if sock is None: + all_exceptions = [exc for sub in exceptions for exc in sub] + try: + first_exception = all_exceptions[0] + if len(all_exceptions) == 1: + raise first_exception + else: + # If they all have the same str(), raise one. + model = str(first_exception) + if all(str(exc) == model for exc in all_exceptions): + raise first_exception + # Raise a combined exception so the user can see all + # the various error messages. + msg = "Multiple exceptions: {}".format( + ", ".join(str(exc) for exc in all_exceptions) + ) + # If the errno is the same for all exceptions, raise + # an OSError with that errno. + if isinstance(first_exception, OSError): + first_errno = first_exception.errno + if all( + isinstance(exc, OSError) and exc.errno == first_errno + for exc in all_exceptions + ): + raise OSError(first_errno, msg) + elif isinstance(first_exception, RuntimeError) and all( + isinstance(exc, RuntimeError) for exc in all_exceptions + ): + raise RuntimeError(msg) + # We have a mix of OSError and RuntimeError + # so we have to pick which one to raise. 
+ # and we raise OSError for compatibility + raise OSError(msg) + finally: + all_exceptions = None # type: ignore[assignment] + exceptions = None # type: ignore[assignment] + + return sock + + +async def _connect_sock( + loop: asyncio.AbstractEventLoop, + exceptions: List[List[Union[OSError, RuntimeError]]], + addr_info: AddrInfoType, + local_addr_infos: Optional[Sequence[AddrInfoType]] = None, +) -> socket.socket: + """Create, bind and connect one socket.""" + my_exceptions: List[Union[OSError, RuntimeError]] = [] + exceptions.append(my_exceptions) + family, type_, proto, _, address = addr_info + sock = None + try: + sock = socket.socket(family=family, type=type_, proto=proto) + sock.setblocking(False) + if local_addr_infos is not None: + for lfamily, _, _, _, laddr in local_addr_infos: + # skip local addresses of different family + if lfamily != family: + continue + try: + sock.bind(laddr) + break + except OSError as exc: + msg = ( + f"error while attempting to bind on " + f"address {laddr!r}: " + f"{(exc.strerror or '').lower()}" + ) + exc = OSError(exc.errno, msg) + my_exceptions.append(exc) + else: # all bind attempts failed + if my_exceptions: + raise my_exceptions.pop() + else: + raise OSError(f"no matching local address with {family=} found") + await loop.sock_connect(sock, address) + return sock + except (RuntimeError, OSError) as exc: + my_exceptions.append(exc) + if sock is not None: + try: + sock.close() + except OSError as e: + my_exceptions.append(e) + raise + raise + except: + if sock is not None: + try: + sock.close() + except OSError as e: + my_exceptions.append(e) + raise + raise + finally: + exceptions = my_exceptions = None # type: ignore[assignment] + + +def _interleave_addrinfos( + addrinfos: Sequence[AddrInfoType], first_address_family_count: int = 1 +) -> List[AddrInfoType]: + """Interleave list of addrinfo tuples by family.""" + # Group addresses by family + addrinfos_by_family: collections.OrderedDict[int, List[AddrInfoType]] = ( + collections.OrderedDict() + ) + for addr in addrinfos: + family = addr[0] + if family not in addrinfos_by_family: + addrinfos_by_family[family] = [] + addrinfos_by_family[family].append(addr) + addrinfos_lists = list(addrinfos_by_family.values()) + + reordered: List[AddrInfoType] = [] + if first_address_family_count > 1: + reordered.extend(addrinfos_lists[0][: first_address_family_count - 1]) + del addrinfos_lists[0][: first_address_family_count - 1] + reordered.extend( + a + for a in itertools.chain.from_iterable(itertools.zip_longest(*addrinfos_lists)) + if a is not None + ) + return reordered diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/py.typed b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/types.py b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/types.py new file mode 100644 index 0000000000000000000000000000000000000000..01d79a28eb0bcb4c6daa2b2f656b0014aecb258c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/types.py @@ -0,0 +1,12 @@ +"""Types for aiohappyeyeballs.""" + +import socket +from typing import Tuple, Union + +AddrInfoType = Tuple[ + Union[int, socket.AddressFamily], + Union[int, socket.SocketKind], + int, + str, + Tuple, # type: ignore[type-arg] +] diff --git a/.venv/lib/python3.11/site-packages/aiohappyeyeballs/utils.py b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/utils.py 
new file mode 100644 index 0000000000000000000000000000000000000000..ea29adb9be9edd751cd6d7b93ca9c4bd8d08b658 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/aiohappyeyeballs/utils.py @@ -0,0 +1,97 @@ +"""Utility functions for aiohappyeyeballs.""" + +import ipaddress +import socket +from typing import Dict, List, Optional, Tuple, Union + +from .types import AddrInfoType + + +def addr_to_addr_infos( + addr: Optional[ + Union[Tuple[str, int, int, int], Tuple[str, int, int], Tuple[str, int]] + ], +) -> Optional[List[AddrInfoType]]: + """Convert an address tuple to a list of addr_info tuples.""" + if addr is None: + return None + host = addr[0] + port = addr[1] + is_ipv6 = ":" in host + if is_ipv6: + flowinfo = 0 + scopeid = 0 + addr_len = len(addr) + if addr_len >= 4: + scopeid = addr[3] # type: ignore[misc] + if addr_len >= 3: + flowinfo = addr[2] # type: ignore[misc] + addr = (host, port, flowinfo, scopeid) + family = socket.AF_INET6 + else: + addr = (host, port) + family = socket.AF_INET + return [(family, socket.SOCK_STREAM, socket.IPPROTO_TCP, "", addr)] + + +def pop_addr_infos_interleave( + addr_infos: List[AddrInfoType], interleave: Optional[int] = None +) -> None: + """ + Pop addr_info from the list of addr_infos by family up to interleave times. + + The interleave parameter is used to know how many addr_infos for + each family should be popped of the top of the list. + """ + seen: Dict[int, int] = {} + if interleave is None: + interleave = 1 + to_remove: List[AddrInfoType] = [] + for addr_info in addr_infos: + family = addr_info[0] + if family not in seen: + seen[family] = 0 + if seen[family] < interleave: + to_remove.append(addr_info) + seen[family] += 1 + for addr_info in to_remove: + addr_infos.remove(addr_info) + + +def _addr_tuple_to_ip_address( + addr: Union[Tuple[str, int], Tuple[str, int, int, int]], +) -> Union[ + Tuple[ipaddress.IPv4Address, int], Tuple[ipaddress.IPv6Address, int, int, int] +]: + """Convert an address tuple to an IPv4Address.""" + return (ipaddress.ip_address(addr[0]), *addr[1:]) + + +def remove_addr_infos( + addr_infos: List[AddrInfoType], + addr: Union[Tuple[str, int], Tuple[str, int, int, int]], +) -> None: + """ + Remove an address from the list of addr_infos. + + The addr value is typically the return value of + sock.getpeername(). 
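    A minimal usage sketch (illustrative only; the literal address and port are
    hypothetical documentation values, not taken from this module)::

        from aiohappyeyeballs import addr_to_addr_infos, remove_addr_infos

        addr_infos = addr_to_addr_infos(("192.0.2.1", 443))
        # After an attempt to this peer finishes, drop it from the candidates.
        remove_addr_infos(addr_infos, ("192.0.2.1", 443))
        assert addr_infos == []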
+ """ + bad_addrs_infos: List[AddrInfoType] = [] + for addr_info in addr_infos: + if addr_info[-1] == addr: + bad_addrs_infos.append(addr_info) + if bad_addrs_infos: + for bad_addr_info in bad_addrs_infos: + addr_infos.remove(bad_addr_info) + return + # Slow path in case addr is formatted differently + match_addr = _addr_tuple_to_ip_address(addr) + for addr_info in addr_infos: + if match_addr == _addr_tuple_to_ip_address(addr_info[-1]): + bad_addrs_infos.append(addr_info) + if bad_addrs_infos: + for bad_addr_info in bad_addrs_infos: + addr_infos.remove(bad_addr_info) + return + raise ValueError(f"Address {addr} not found in addr_infos") diff --git a/.venv/lib/python3.11/site-packages/attrs/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/attrs/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d981df1502727f3185ed3a2cd9d07c9dd4826727 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/attrs/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/attrs/__pycache__/validators.cpython-311.pyc b/.venv/lib/python3.11/site-packages/attrs/__pycache__/validators.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e55b831ab4bdce4f29c3d70df86b5c00b002da2a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/attrs/__pycache__/validators.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/__init__.py b/.venv/lib/python3.11/site-packages/vllm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..566c5116d5f098dded1eed924b7a76a9ce22a16c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/__init__.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +import os + +import torch + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.entrypoints.llm import LLM +from vllm.executor.ray_utils import initialize_ray_cluster +from vllm.inputs import PromptType, TextPrompt, TokensPrompt +from vllm.model_executor.models import ModelRegistry +from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput, + CompletionOutput, EmbeddingOutput, + EmbeddingRequestOutput, PoolingOutput, + PoolingRequestOutput, RequestOutput, ScoringOutput, + ScoringRequestOutput) +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import SamplingParams + +from .version import __version__, __version_tuple__ + +# set some common config/environment variables that should be set +# for all processes created by vllm and all processes +# that interact with vllm workers. +# they are executed whenever `import vllm` is called. 
+ +# see https://github.com/NVIDIA/nccl/issues/1234 +os.environ['NCCL_CUMEM_ENABLE'] = '0' + +# see https://github.com/vllm-project/vllm/issues/10480 +os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' +# see https://github.com/vllm-project/vllm/issues/10619 +torch._inductor.config.compile_threads = 1 + +__all__ = [ + "__version__", + "__version_tuple__", + "LLM", + "ModelRegistry", + "PromptType", + "TextPrompt", + "TokensPrompt", + "SamplingParams", + "RequestOutput", + "CompletionOutput", + "PoolingOutput", + "PoolingRequestOutput", + "EmbeddingOutput", + "EmbeddingRequestOutput", + "ClassificationOutput", + "ClassificationRequestOutput", + "ScoringOutput", + "ScoringRequestOutput", + "LLMEngine", + "EngineArgs", + "AsyncLLMEngine", + "AsyncEngineArgs", + "initialize_ray_cluster", + "PoolingParams", +] diff --git a/.venv/lib/python3.11/site-packages/vllm/_custom_ops.py b/.venv/lib/python3.11/site-packages/vllm/_custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a68235016767571c88bdbaaa8dcb51dcc35258c8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/_custom_ops.py @@ -0,0 +1,1098 @@ +# SPDX-License-Identifier: Apache-2.0 + +import contextlib +import importlib +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import torch +import torch.library + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType + +logger = init_logger(__name__) + +if not current_platform.is_tpu() and not current_platform.is_hpu(): + try: + import vllm._C + except ImportError as e: + logger.warning("Failed to import from vllm._C with %r", e) + +supports_moe_ops = False +with contextlib.suppress(ImportError): + import vllm._moe_C # noqa: F401 + supports_moe_ops = True + +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + + +# page attention ops +def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + block_size: int, + max_seq_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +) -> None: + torch.ops._C.paged_attention_v1( + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, + seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, + k_scale, v_scale, tp_rank, blocksparse_local_blocks, + blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step) + + +def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + block_size: int, + max_seq_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +) -> 
None: + torch.ops._C.paged_attention_v2( + out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, + num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, + alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank, + blocksparse_local_blocks, blocksparse_vert_stride, + blocksparse_block_size, blocksparse_head_sliding_step) + + +def paged_attention_rocm( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + block_size: int, + max_seq_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, +) -> None: + torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query, + key_cache, value_cache, num_kv_heads, + scale, block_tables, seq_lens, + block_size, max_seq_len, alibi_slopes, + kv_cache_dtype, k_scale, v_scale) + + +# pos encoding ops +def rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + torch.ops._C.rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox) + + +def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox, rot_dim, + cos_sin_cache_offsets) + + +# layer norm ops +def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> None: + torch.ops._C.rms_norm(out, input, weight, epsilon) + + +def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) + + +def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, + input_tokens: torch.Tensor, + sampled_token_ids: torch.Tensor, + input_positions: torch.Tensor, + seq_lens: torch.Tensor, slot_mapping: torch.Tensor, + block_tables: torch.Tensor) -> None: + """Advance a step on GPU for existing inputs for a multi-step runner""" + return torch.ops._C.advance_step_flashattn(num_seqs, num_queries, + block_size, input_tokens, + sampled_token_ids, + input_positions, seq_lens, + slot_mapping, block_tables) + + +def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int, + input_tokens: torch.Tensor, + sampled_token_ids: torch.Tensor, + input_positions: torch.Tensor, + seq_lens: torch.Tensor, slot_mapping: torch.Tensor, + block_tables: torch.Tensor, + paged_kv_indices: torch.Tensor, + paged_kv_indptr: torch.Tensor, + paged_kv_last_page_len: torch.Tensor, + block_table_bound: torch.Tensor) -> None: + + return torch.ops._C.advance_step_flashinfer( + num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, + input_positions, seq_lens, slot_mapping, block_tables, + paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len, + block_table_bound) + + +# fused quant layer norm ops +def rms_norm_dynamic_per_token_quant( + input: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, + scale_ub: Optional[torch.Tensor] = None, + residual: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: + 
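    # Fused RMSNorm + dynamic per-token quantization: normalize `input` with
    # `weight` and `epsilon` (optionally fusing a `residual` input), then
    # quantize each token row to `quant_dtype` with a dynamically computed
    # float32 scale, optionally clamped by `scale_ub`.
    # Returns the quantized tensor and per-token scales of shape (num_tokens, 1).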
output = torch.empty_like(input, dtype=quant_dtype) + scales = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + + torch.ops._C.rms_norm_dynamic_per_token_quant(output, input, weight, + scales, epsilon, scale_ub, + residual) + return output, scales + + +# quantization ops +# awq +def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor, + zeros: torch.Tensor, split_k_iters: int, thx: int, + thy: int) -> torch.Tensor: + if envs.VLLM_USE_TRITON_AWQ: + from vllm.model_executor.layers.quantization.awq_triton import ( + awq_dequantize_triton) + return awq_dequantize_triton(qweight, scales, zeros) + return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, + thx, thy) + + +def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor, + scales: torch.Tensor, split_k_iters: int) -> torch.Tensor: + if envs.VLLM_USE_TRITON_AWQ: + from vllm.model_executor.layers.quantization.awq_triton import ( + awq_gemm_triton) + return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters) + return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters) + + +# gptq +def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, + b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor, + b_g_idx: torch.Tensor, use_exllama: bool, + bit: int) -> torch.Tensor: + return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, + b_g_idx, use_exllama, bit) + + +if hasattr(torch.ops._C, "gptq_gemm"): + + @register_fake("_C::gptq_gemm") + def _gptq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, + b_gptq_qzeros: torch.Tensor, + b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor, + use_exllama: bool, bit: int) -> torch.Tensor: + return torch.empty((a.size(0), b_q_weight.size(1)), + dtype=a.dtype, + device=a.device) + + +def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, + bit: int) -> None: + torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) + + +# marlin +def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, + b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, + size_n: int, size_k: int) -> torch.Tensor: + return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, + size_n, size_k) + + +# marlin_24 +def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, + b_meta: torch.Tensor, b_scales: torch.Tensor, + workspace: torch.Tensor, b_q_type: ScalarType, + size_m: int, size_n: int, size_k: int) -> torch.Tensor: + return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, + workspace, b_q_type.id, size_m, + size_n, size_k) + + +if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): + + @register_fake("_C::gptq_marlin_24_gemm") + def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, + b_meta: torch.Tensor, b_scales: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt) -> torch.Tensor: + return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) + + @register_fake("_C::gptq_marlin_gemm") + def _gptq_marlin_gemm_fake(a: torch.Tensor, + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + b_zeros: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + has_zp: bool = False, + use_fp32_reduce: bool = False, + is_zp_float: bool = False) -> torch.Tensor: + return torch.empty((size_m, size_n), 
device=a.device, dtype=a.dtype) + + @register_fake("_C::marlin_qqq_gemm") + def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, + s_tok: torch.Tensor, s_ch: torch.Tensor, + s_group: torch.Tensor, workspace: torch.Tensor, + size_m: torch.SymInt, size_n: torch.SymInt, + size_k: torch.SymInt) -> torch.Tensor: + return torch.empty((size_m, size_n), + dtype=torch.float16, + device=a.device) + + @register_fake("_C::marlin_gemm") + def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, + b_scales: torch.Tensor, workspace: torch.Tensor, + size_m: torch.SymInt, size_n: torch.SymInt, + size_k: torch.SymInt) -> torch.Tensor: + return torch.empty((size_m, size_n), + dtype=torch.float16, + device=a.device) + + @register_fake("_C::awq_dequantize") + def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor, + zeros: torch.Tensor, split_k_iters: torch.SymInt, + thx: int, thy: int) -> torch.Tensor: + in_c = qweight.size(0) + qout_c = qweight.size(1) + out_c = qout_c * 8 + return torch.empty((in_c, out_c), + dtype=scales.dtype, + device=scales.device) + + @register_fake("_C::awq_gemm") + def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor, + qzeros: torch.Tensor, scales: torch.Tensor, + split_k_iters: torch.SymInt) -> torch.Tensor: + num_in_feats = input.size(0) + return torch.empty((split_k_iters, num_in_feats, qweight.size(1) * 8), + dtype=input.dtype, + device=input.device).sum(0) + + @register_fake("_C::aqlm_gemm") + def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, + codebooks: torch.Tensor, scales: torch.Tensor, + codebook_partition_sizes: List[int], + bias: Optional[torch.Tensor]) -> torch.Tensor: + out_features = codes.size(0) * codebooks.size(2) + flat_input = input.reshape((-1, input.size(-1))) + flat_output = torch.empty((flat_input.size(0), out_features), + dtype=input.dtype, + device=input.device) + + output_sizes = list(input.shape) + output_sizes.pop() + output_sizes.append(-1) + return flat_output.reshape(tuple(output_sizes)) + + @register_fake("_C::aqlm_dequant") + def _aqlm_dequant_fake( + codes: torch.Tensor, codebooks: torch.Tensor, + codebook_partition_sizes: List[int]) -> torch.Tensor: + in_features = codes.size(1) * 8 + out_features = codes.size(0) + return torch.empty((out_features, in_features), + dtype=codebooks.dtype, + device=codebooks.device) + + @register_fake("_C::fp8_marlin_gemm") + def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, + b_scales: torch.Tensor, workspace: torch.Tensor, + num_bits: int, size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt) -> torch.Tensor: + return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device) + + @register_fake("_C::machete_mm") + def machete_mm_fake( + a: torch.Tensor, + # b_q Should be the tensor returned by machete_prepack_B + b_q: torch.Tensor, + b_type: ScalarType, + out_type: Optional[torch.dtype] = None, + b_group_scales: Optional[torch.Tensor] = None, + b_group_zeros: Optional[torch.Tensor] = None, + b_group_size: Optional[int] = None, + b_channel_scales: Optional[torch.Tensor] = None, + a_token_scales: Optional[torch.Tensor] = None, + schedule: Optional[str] = None, + ) -> torch.Tensor: + m = a.size(0) + n = b_q.size(1) + return torch.empty((m, n), device=a.device, dtype=a.dtype) + + @register_fake("_C::machete_prepack_B") + def machete_prepack_B_fake( + b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType, + group_scales_type: Optional[torch.dtype]) -> torch.Tensor: + return torch.empty_like(b_q_weight, + 
memory_format=torch.contiguous_format) + + +if hasattr(torch.ops._C, "ggml_dequantize"): + + @register_fake("_C::ggml_dequantize") + def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int, + m: torch.SymInt, + n: torch.SymInt) -> torch.Tensor: + return torch.empty((m, n), dtype=torch.float16, device=W.device) + + @register_fake("_C::ggml_mul_mat_vec_a8") + def _ggml_mul_mat_vec_a8_fake( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: torch.SymInt, + ) -> torch.Tensor: + return torch.empty((1, row), dtype=torch.float16, device=W.device) + + @register_fake("_C::ggml_mul_mat_a8") + def _ggml_mul_mat_a8_fake( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: torch.SymInt, + ) -> torch.Tensor: + batch = X.size(0) + return torch.empty((batch, row), dtype=torch.float16, device=W.device) + + +# cutlass +def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability) + + +def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_scaled_mm_supports_block_fp8( + cuda_device_capability) + + +def cutlass_scaled_mm(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + `cutlass_scaled_mm` implements a fused version of + `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)` + where scale_a * a and scale_b * b are implemented using numpy-style + broadcasting. + + In order to support blockwise scaling like found in DeepSeek V3 we also + support extended "group" broadcast rules. We extend the numpy-style + broadcasting rules with the following rule: + "if the extent of a dimension in the source shape is between 1 and + corresponding extent in the target shape we repeat each element along + that dimension src_shape[dim] // target_shape[dim] times consecutively" + example if we have: + a = [[1, 2], and target_shape = (2, 4) + [3, 4]] + then we would expand a to: + a = [[1, 1, 2, 2], + [3, 3, 4, 4]] + currently we only support the case: + scale_a.shape * [1, 128] == a.shape + scale_b.shape * [128, 128] == b.shape + """ + assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) + assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) + assert bias is None or bias.shape[0] == b.shape[ + 1] and bias.dtype == out_dtype + + m = a.shape[0] + n = b.shape[1] + + if current_platform.is_rocm(): + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) + + return out + + +def cutlass_scaled_mm_azp(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + azp_adj: torch.Tensor, + azp: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + :param azp_adj: In the per-tensor case, this should include the azp. + Always per-channel. + :param azp: Only set in the per-token case. Per-token if set. 
+ """ + assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) + assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) + assert bias is None or bias.numel( + ) == b.shape[1] and bias.dtype == out_dtype + assert azp is None or azp.numel() == a.shape[0] + + m = a.shape[0] + n = b.shape[1] + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj, + azp, bias) + return out + + +def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_sparse_scaled_mm_supported( + cuda_device_capability) + + +def cutlass_sparse_compress(a: torch.Tensor) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compresses a sparse matrix for use with Cutlass sparse operations. + + This function takes a dense tensor and compresses it into two components: + non-zero elements and metadata. The compressed representation is compatible + with Cutlass sparse kernels. + + Args: + a (torch.Tensor): + The input tensor to be compressed. Must have one of the following data types: + - `torch.int8` + - `torch.float8_e4m3fn` + - `torch.bfloat16` + - `torch.float16` + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + A tuple containing: + - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`. + - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation. + + Raises: + ValueError: If the compression operation fails. + + Notes: + - The `a_meta` tensor has a data type of `torch.uint8`. + - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`). + - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor. + - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`. + """ + assert (a.dtype in [ + torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16 + ]) + assert (a.is_contiguous()) + + # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4 + elemsPerMetaElem = 4 + + m = a.shape[0] + k = a.shape[1] + assert (k % 2 == 0) + a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device) + a_meta = torch.empty((m, k // 2 // elemsPerMetaElem), + dtype=torch.uint8, + device=a.device) + + if not (torch.ops._C.cutlass_sparse_compress_entry(a_nzs, a_meta, a)): + raise ValueError + + assert (a_nzs.is_contiguous()) + assert (a_meta.is_contiguous()) + + return a_nzs, a_meta + + +def cutlass_scaled_sparse_mm( + a: torch.Tensor, + bt_nzs: torch.Tensor, + bt_meta: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Performs a scaled sparse matrix multiplication using Cutlass. + + Steps: + 1. Create a dense matrix `a` of shape (m, k) on the CUDA device: + `a = torch.randn((m, k), device='cuda')`. + + 2. Create a dense matrix `b` of shape (k, n) on the CUDA device: + `b = torch.randn((k, n), device='cuda')`. + + 3. Prune matrix `b` to 2:4 sparsity along the specified dimension: + `b = prune_to_2_4(b, dim=0)`. + + 4. Compress the transposed sparse matrix `b.t()`: + `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`. + + 5. Perform sparse matrix multiplication using the compressed matrix, + applying scaling factors for `a` and `b`, and the output data type: + `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`. + + Returns: + - The result of the scaled sparse matrix multiplication. 
+ """ + assert (bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0) + assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) + assert bias is None or bias.shape[0] == bt_nzs.shape[0] \ + and bias.dtype == out_dtype + + m = a.shape[0] + n = bt_nzs.shape[0] + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a, + scale_b, bias) + + return out + + +# aqlm +def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, + codebooks: torch.Tensor, scales: torch.Tensor, + codebook_partition_sizes: List[int], + bias: Optional[torch.Tensor]) -> torch.Tensor: + return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, + codebook_partition_sizes, bias) + + +def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, + codebook_partition_sizes: List[int]) -> torch.Tensor: + return torch.ops._C.aqlm_dequant(codes, codebooks, + codebook_partition_sizes) + + +# gptq_marlin +def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, + size_k: int, size_n: int, + num_bits: int) -> torch.Tensor: + return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, + num_bits) + + +# gptq_marlin +def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int, + num_bits: int) -> torch.Tensor: + return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) + + +def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, + size_k: int, size_n: int, + num_bits: int) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype) + for e in range(num_experts): + output[e] = torch.ops._C.gptq_marlin_repack(b_q_weight[e], perm[e], + size_k, size_n, num_bits) + return output + + +def awq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, + size_k: int, size_n: int, + num_bits: int) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype) + for e in range(num_experts): + output[e] = torch.ops._C.awq_marlin_repack(b_q_weight[e], size_k, + size_n, num_bits) + return output + + +def gptq_marlin_gemm(a: torch.Tensor, + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + b_zeros: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: int, + size_n: int, + size_k: int, + is_k_full: bool, + has_zp: bool = False, + use_fp32_reduce: bool = False, + is_zp_float: bool = False) -> torch.Tensor: + return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, + g_idx, perm, workspace, b_q_type.id, + size_m, size_n, size_k, is_k_full, + has_zp, use_fp32_reduce, is_zp_float) + + +# fp8 marlin +def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, + b_scales: torch.Tensor, workspace: torch.Tensor, + num_bits: int, size_m: int, size_n: int, + size_k: int) -> torch.Tensor: + return torch.ops._C.fp8_marlin_gemm(a, b_q_weight, b_scales, workspace, + num_bits, size_m, size_n, size_k) + + +# machete +def machete_supported_schedules( + a_type: torch.dtype, + b_type: ScalarType, + group_scales_type: Optional[torch.dtype], + group_zeros_type: Optional[torch.dtype] = None, + channel_scales_type: Optional[torch.dtype] = None, + token_scales_type: Optional[torch.dtype] = None, + out_type: 
Optional[torch.dtype] = None) -> List[str]: + return torch.ops._C.machete_supported_schedules( + a_type, b_type.id, group_scales_type, group_zeros_type, + channel_scales_type, token_scales_type, out_type) + + +def machete_mm( + a: torch.Tensor, + # b_q Should be the tensor returned by machete_prepack_B + b_q: torch.Tensor, + b_type: ScalarType, + out_type: Optional[torch.dtype] = None, + b_group_scales: Optional[torch.Tensor] = None, + b_group_zeros: Optional[torch.Tensor] = None, + b_group_size: Optional[int] = None, + b_channel_scales: Optional[torch.Tensor] = None, + a_token_scales: Optional[torch.Tensor] = None, + schedule: Optional[str] = None) -> torch.Tensor: + return torch.ops._C.machete_mm(a, b_q, b_type.id, out_type, b_group_scales, + b_group_zeros, b_group_size, + b_channel_scales, a_token_scales, schedule) + + +def machete_prepack_B( + b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType, + group_scales_type: Optional[torch.dtype]) -> torch.Tensor: + return torch.ops._C.machete_prepack_B(b_q_weight, a_type, b_type.id, + group_scales_type) + + +if hasattr(torch.ops._C, "permute_cols"): + + @register_fake("_C::permute_cols") + def _permute_cols_fake(a: torch.Tensor, + perm: torch.Tensor) -> torch.Tensor: + return torch.empty_like(a) + + +def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor: + return torch.ops._C.permute_cols(a, perm) + + +# fp8 +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + num_token_padding: Optional[int] = None, + scale_ub: Optional[torch.Tensor] = None, + use_per_token_if_dynamic: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Quantize input tensor to FP8 and return quantized tensor and scale. + + This function supports both static and dynamic quantization: If you + provide the scale, it will use static scaling and if you omit it, + the scale will be determined dynamically. The function also allows + optional padding of the output tensors for downstream kernels that + will benefit from padding. + + Args: + input: The input tensor to be quantized to FP8 + scale: Optional scaling factor for the FP8 quantization + scale_ub: Optional upper bound for scaling factor in dynamic + per token case + num_token_padding: If specified, pad the first dimension + of the output to at least this value. + use_per_token_if_dynamic: Whether to do per_tensor or per_token + in the dynamic quantization case. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + scaling factor. 
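    A short usage sketch (illustrative; the shapes, dtype and CUDA device are
    assumptions, not values taken from this module)::

        import torch
        from vllm import _custom_ops as ops

        x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

        # Dynamic quantization: the per-tensor scale is computed from `x`.
        x_fp8, scale = ops.scaled_fp8_quant(x)

        # Static quantization: reuse a precomputed per-tensor scale.
        x_fp8_again, _ = ops.scaled_fp8_quant(x, scale=scale)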
+ """ + # This code assumes batch_dim and num_tokens are flattened + assert (input.ndim == 2) + shape: Union[Tuple[int, int], torch.Size] = input.shape + # For rocm, the output fp8 dtype is torch.float_e3m3fnuz + out_dtype: torch.dtype = torch.float8_e4m3fnuz \ + if current_platform.is_rocm() else torch.float8_e4m3fn + if num_token_padding: + shape = (max(num_token_padding, input.shape[0]), shape[1]) + output = torch.empty(shape, device=input.device, dtype=out_dtype) + + if scale is None: + if use_per_token_if_dynamic: + scale = torch.empty((shape[0], 1), + device=input.device, + dtype=torch.float32) + torch.ops._C.dynamic_per_token_scaled_fp8_quant( + output, input, scale, scale_ub) + else: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + # num_token_padding not implemented for this case + assert (scale.numel() == 1 or num_token_padding is None) + torch.ops._C.static_scaled_fp8_quant(output, input, scale) + + return output, scale + + +# int8 +def scaled_int8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + azp: Optional[torch.Tensor] = None, + symmetric: bool = True +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """ + Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. + + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == ( + azp + is None), "azp must only be provided for asymmetric quantization." + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. 
+ input_scales = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + input_azp = None if symmetric else torch.empty_like(input_scales, + dtype=torch.int32) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, + input_azp) + return output, input_scales, input_azp + + +# qqq ops +def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, + s_tok: torch.Tensor, s_ch: torch.Tensor, + s_group: torch.Tensor, workspace: torch.Tensor, + size_m: int, size_n: int, size_k: int) -> torch.Tensor: + return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group, + workspace, size_m, size_n, size_k) + + +# gguf +def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, + n: int) -> torch.Tensor: + return torch.ops._C.ggml_dequantize(W, quant_type, m, n) + + +def ggml_mul_mat_vec_a8( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: int, +) -> torch.Tensor: + return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row) + + +def ggml_mul_mat_a8( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: int, +) -> torch.Tensor: + return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row) + + +# mamba +def causal_conv1d_fwd(x: torch.Tensor, weight: torch.Tensor, + bias_: Optional[torch.Tensor], + conv_states: Optional[torch.Tensor], + query_start_loc: Optional[torch.Tensor], + cache_indices: Optional[torch.Tensor], + has_initial_state: Optional[torch.Tensor], + silu_activation: bool, pad_slot_id: int): + torch.ops._C.causal_conv1d_fwd(x, weight, bias_, conv_states, + query_start_loc, cache_indices, + has_initial_state, silu_activation, + pad_slot_id) + + +def causal_conv1d_update(x: torch.Tensor, conv_state: torch.Tensor, + weight: torch.Tensor, bias_: Optional[torch.Tensor], + silu_activation: bool, + cache_seqlens: Optional[torch.Tensor], + conv_state_indices: Optional[torch.Tensor], + pad_slot_id: int): + torch.ops._C.causal_conv1d_update(x, conv_state, weight, bias_, + silu_activation, cache_seqlens, + conv_state_indices, pad_slot_id) + + +def selective_scan_fwd(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor, + B: torch.Tensor, C: torch.Tensor, + D_: Optional[torch.Tensor], z_: Optional[torch.Tensor], + delta_bias_: Optional[torch.Tensor], + delta_softplus: bool, + query_start_loc: Optional[torch.Tensor], + cache_indices: Optional[torch.Tensor], + has_initial_state: Optional[torch.Tensor], + ssm_states: torch.Tensor, pad_slot_id: int): + torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_, delta_bias_, + delta_softplus, query_start_loc, + cache_indices, has_initial_state, + ssm_states, pad_slot_id) + + +# moe +def moe_sum(input: torch.Tensor, output: torch.Tensor): + torch.ops._moe_C.moe_sum(input, output) + + +def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, + block_size: int, sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor) -> None: + torch.ops._moe_C.moe_align_block_size(topk_ids, num_experts, block_size, + sorted_token_ids, experts_ids, + num_tokens_post_pad) + + +def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, + block_size: int, sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor) -> None: + torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts, + block_size, sorted_token_ids, + experts_ids, num_tokens_post_pad) + + +def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: 
float) -> None: + torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, + token_expert_indicies, gating_output) + + +if supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe"): + + @register_fake("_moe_C::marlin_gemm_moe") + def marlin_gemm_moe_fake(a: torch.Tensor, b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, b_scales: torch.Tensor, + b_zero_points: torch.Tensor, g_idx: torch.Tensor, + perm: torch.Tensor, workspace: torch.Tensor, + b_q_type: ScalarType, size_m: torch.SymInt, + size_n: torch.SymInt, size_k: torch.SymInt, + is_k_full: bool, num_experts: int, topk: int, + moe_block_size: int, replicate_input: bool, + apply_weights: bool) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), + dtype=a.dtype, + device=a.device) + + +def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, +) -> None: + torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, + value_cache, slot_mapping, + kv_cache_dtype, k_scale, v_scale) + + +def reshape_and_cache_flash( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, +) -> None: + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, + value_cache, slot_mapping, + kv_cache_dtype, k_scale, + v_scale) + + +def concat_and_cache_mla( + kv_c: torch.Tensor, + k_pe: torch.Tensor, + kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + scale: torch.Tensor, +) -> None: + torch.ops._C_cache_ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, + slot_mapping, kv_cache_dtype, + scale) + + +def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + + +def copy_blocks_mla(kv_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) + + +def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) + + +def convert_fp8(output: torch.Tensor, + input: torch.Tensor, + scale: float = 1.0, + kv_dtype: str = "fp8") -> None: + torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) + + +def get_device_attribute(attribute: int, device: int) -> int: + return torch.ops._C_cuda_utils.get_device_attribute(attribute, device) + + +def get_max_shared_memory_per_block_device_attribute(device: int) -> int: + # ruff: noqa: E501 + return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute( + device) + + +# custom ar +def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor, + rank: int, full_nvlink: bool) -> int: + return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, + full_nvlink) + + +def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int, + reg_buffer_sz_bytes: int) -> None: + torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, + reg_buffer_sz_bytes) + + +def dispose(fa: int) -> None: + torch.ops._C_custom_ar.dispose(fa) + + +def meta_size() -> int: + return torch.ops._C_custom_ar.meta_size() + + +def register_buffer(fa: int, 
ipc_tensors: List[int]) -> None: + return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors) + + +def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: + return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) + + +def register_graph_buffers(fa: int, handles: List[List[int]], + offsets: List[List[int]]) -> None: + torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) diff --git a/.venv/lib/python3.11/site-packages/vllm/_ipex_ops.py b/.venv/lib/python3.11/site-packages/vllm/_ipex_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb67baa53383b9748491036c1f9c8f56aa038f8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/_ipex_ops.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, Tuple + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + logger.warning("Import error msg: %s", e.msg) + + +class ipex_ops: + + @staticmethod + def _reshape_activation_tensor( + x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + @staticmethod + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.silu_and_mul(x, out) + + @staticmethod + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_and_mul(x, out) + + @staticmethod + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_and_mul(x, out) + + @staticmethod + def gelu_fast(x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.gelu(x) + + @staticmethod + def gelu_new(x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.gelu(x) + + @staticmethod + def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_quick(x, out) + + @staticmethod + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + ipex.llm.modules.PagedAttention.single_query_kv_attention( + out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, + num_queries_per_tokens, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + + @staticmethod + def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + 
blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + ipex.llm.modules.PagedAttention.single_query_kv_attention( + out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, + num_queries_per_tokens, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + + @staticmethod + def rotary_embedding( + positions: torch.Tensor, # [batch_size, seq_len] + query: torch.Tensor, # [batch_size, seq_len, num_heads*head_size] + key: torch.Tensor, # [batch_size, seq_len, num_kv_heads*head_size] + head_size: int, + cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] + is_neox: bool, + ) -> None: + rot_dim = cos_sin_cache.size(1) + ipex.llm.functional.rotary_embedding_batched(positions, query, key, + head_size, cos_sin_cache, + is_neox, rot_dim) + + @staticmethod + def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + ipex.llm.functional.rotary_embedding_batched(positions, query, key, + head_size, cos_sin_cache, + is_neox, rot_dim, + cos_sin_cache_offsets) + + @staticmethod + def rms_norm(input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> torch.Tensor: + return ipex.llm.functional.rms_norm(input, weight, epsilon) + + @staticmethod + def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None, + epsilon, True) + input.copy_(tmp) + + @staticmethod + def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, + logits_soft_cap: float, + ) -> None: + ipex.llm.functional.varlen_attention(query.contiguous(), + key.contiguous(), + value.contiguous(), out, + seqlen_q.int(), seqlen_k.int(), + max_seqlen_q, max_seqlen_k, + pdropout, softmax_scale, + zero_tensors, is_causal, + return_softmax, gen_, + logits_soft_cap) + + @staticmethod + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) + + @staticmethod + def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.xpu.copy_blocks( # type: ignore + key_caches, + value_caches, + block_mapping, + ) + + @staticmethod + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) # type: ignore diff --git a/.venv/lib/python3.11/site-packages/vllm/_version.py b/.venv/lib/python3.11/site-packages/vllm/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..393e722907adfa1c47e20ef193c1f4216586a62e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/_version.py @@ -0,0 +1,16 @@ +# file generated by 
setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '0.7.2' +__version_tuple__ = version_tuple = (0, 7, 2) diff --git a/.venv/lib/python3.11/site-packages/vllm/beam_search.py b/.venv/lib/python3.11/site-packages/vllm/beam_search.py new file mode 100644 index 0000000000000000000000000000000000000000..97b2b630fc3e5101107aec09d856c49b48ff2539 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/beam_search.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from vllm.sequence import Logprob + +if TYPE_CHECKING: + from vllm.multimodal import MultiModalDataDict + + +@dataclass +class BeamSearchSequence: + """A sequence for beam search. + It keeps track of the tokens and the log probability of the sequence. + The text field is optional and will only be filled when the sequence is + about to be returned to the user. + """ + # The tokens includes the prompt. + tokens: List[int] + logprobs: List[Dict[int, Logprob]] + cum_logprob: float = 0.0 + text: Optional[str] = None + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + multi_modal_data: Optional["MultiModalDataDict"] = None + mm_processor_kwargs: Optional[Dict[str, Any]] = None + + +@dataclass +class BeamSearchOutput: + """The output of beam search. + It contains the list of the best beam search sequences. + The length of the list is equal to the beam width. + """ + sequences: List[BeamSearchSequence] + + +class BeamSearchInstance: + + def __init__(self, prompt_tokens: List[int]): + self.beams: List[BeamSearchSequence] = [ + BeamSearchSequence(tokens=prompt_tokens, logprobs=[]) + ] + self.completed: List[BeamSearchSequence] = [] + + +def get_beam_search_score( + tokens: List[int], + cumulative_logprob: float, + eos_token_id: int, + length_penalty: float = 1.0, +) -> float: + """Calculate the beam search score with length penalty. 
+ + Adapted from + + https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 + """ + seq_len = len(tokens) + if tokens[-1] == eos_token_id: + seq_len -= 1 + + return cumulative_logprob / (seq_len**length_penalty) + + +def create_sort_beams_key_function(eos_token_id: int, length_penalty: float): + + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score(x.tokens, x.cum_logprob, eos_token_id, + length_penalty) + + return sort_beams_key diff --git a/.venv/lib/python3.11/site-packages/vllm/config.py b/.venv/lib/python3.11/site-packages/vllm/config.py new file mode 100644 index 0000000000000000000000000000000000000000..9ba49757612452f74032955ba4e020cbbf483a1d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/config.py @@ -0,0 +1,3435 @@ +# SPDX-License-Identifier: Apache-2.0 + +import ast +import copy +import enum +import hashlib +import json +import sys +import warnings +from contextlib import contextmanager +from dataclasses import dataclass, field, replace +from pathlib import Path +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, + Final, List, Literal, Mapping, Optional, Protocol, Set, + Tuple, Type, Union) + +import torch +from pydantic import BaseModel, Field, PrivateAttr +from transformers import PretrainedConfig + +import vllm.envs as envs +from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, + get_quantization_config) +from vllm.model_executor.models import ModelRegistry +from vllm.platforms import CpuArchEnum +from vllm.tracing import is_otel_available, otel_import_error_traceback +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_tokenizer_config, is_encoder_decoder, + try_get_generation_config, uses_mrope) +from vllm.transformers_utils.s3_utils import S3Model +from vllm.transformers_utils.utils import is_s3 +from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, + get_cpu_memory, random_uuid, resolve_obj_by_qualname) + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + + from vllm.executor.executor_base import ExecutorBase + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + from vllm.model_executor.model_loader.loader import BaseModelLoader + from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +else: + QuantizationConfig = None + +logger = init_logger(__name__) + +_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 + +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward"] + +_ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", + "draft"] + +RunnerType = Literal["generate", "pooling", "draft"] + +_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { + "generate": ["generate"], + "pooling": ["embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { + task: runner + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks +} + +HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], + PretrainedConfig]] + + +class SupportsHash(Protocol): + + def compute_hash(self) -> str: 
+ ... + + +class ModelImpl(str, enum.Enum): + AUTO = "auto" + VLLM = "vllm" + TRANSFORMERS = "transformers" + + +class ModelConfig: + """Configuration for the model. + + Args: + model: Name or path of the huggingface model to use. + It is also used as the content for `model_name` tag in metrics + output when `served_model_name` is not specified. + task: The task to use the model for. Each vLLM instance only supports + one task, even if the same model can be used for multiple tasks. + When the model only supports one task, "auto" can be used to select + it; otherwise, you must specify explicitly which task to use. + tokenizer: Name or path of the huggingface tokenizer to use. + tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if + available, "slow" will always use the slow tokenizer, and + "mistral" will always use the tokenizer from `mistral_common`. + trust_remote_code: Trust remote code (e.g., from HuggingFace) when + downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images or + videos from directories specified by the server file system. + This is a security risk. Should only be enabled in trusted + environments. + dtype: Data type for model weights and activations. The "auto" option + will use FP16 precision for FP32 and FP16 models, and BF16 precision + for BF16 models. + seed: Random seed for reproducibility. + revision: The specific model version to use. It can be a branch name, + a tag name, or a commit id. If unspecified, will use the default + version. + code_revision: The specific revision to use for the model code on + Hugging Face Hub. It can be a branch name, a tag name, or a + commit id. If unspecified, will use the default version. + tokenizer_revision: The specific tokenizer version to use. It can be a + branch name, a tag name, or a commit id. If unspecified, will use + the default version. + max_model_len: Maximum length of a sequence (including prompt and + output). If None, will be derived from the model. + spec_target_max_model_len: Specify the the maximum length for spec + decoding draft models. + quantization: Quantization method that was used to quantize the model + weights. If None, we assume the model weights are not quantized. + enforce_eager: Whether to enforce eager execution. If True, we will + disable CUDA graph and always execute the model in eager mode. + If False, we will use CUDA graph and eager execution in hybrid. + If None, the user did not specify, so default to False. + max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. + When a sequence has context length larger than this, we fall back + to eager mode. Additionally for encoder-decoder models, if the + sequence length of the encoder input is larger than this, we fall + back to the eager mode. + max_logprobs: Maximum number of log probabilities. Defaults to 20. + disable_sliding_window: Whether to disable sliding window. If True, + we will disable the sliding window functionality of the model. + If the model does not support sliding window, this argument is + ignored. + skip_tokenizer_init: If true, skip initialization of tokenizer and + detokenizer. + served_model_name: The model name used in metrics tag `model_name`, + matches the model name exposed via the APIs. If multiple model + names provided, the first name will be used. If not specified, + the model name will be the same as `model`. + limit_mm_per_prompt: Maximum number of data items per modality + per prompt. Only applicable for multimodal models. 
+ use_async_output_proc: Whether to use async output processor. + Defaults to True. + config_format: The config format which shall be loaded. + Defaults to 'auto' which defaults to 'hf'. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. + mm_processor_kwargs: Arguments to be forwarded to the model's processor + for multi-modal data, e.g., image processor. + disable_mm_preprocessor_cache: If true, then disables caching of the + multi-modal preprocessor/mapper. (not recommended) + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. + override_pooler_config: Initialize non default pooling config or + override default pooling config for the pooling model. + logits_processor_pattern: Optional regex pattern specifying valid + logits processor qualified names that can be passed with the + `logits_processors` extra completion argument. Defaults to None, + which allows no processors. + generation_config: Configuration parameter file for generation. + model_impl: Which implementation of the model to use: + "auto" will try to use the vLLM implementation if it exists and + fall back to the Transformers implementation if no vLLM + implementation is available. + "vllm" will use the vLLM model implementation. + "transformers" will use the Transformers model implementation. + override_generation_config: Override the generation config with the + given config. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. 
+ """ + factors: List[Any] = [] + factors.append(self.model) + factors.append(self.dtype) + factors.append(self.quantization) + factors.append(self.revision) + factors.append(self.code_revision) + factors.append(self.trust_remote_code) + factors.append(self.rope_scaling) + factors.append(self.rope_theta) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __init__( + self, + model: str, + task: Union[TaskOption, Literal["draft"]], + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + allowed_local_media_path: str = "", + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + config_format: ConfigFormat = ConfigFormat.AUTO, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + disable_mm_preprocessor_cache: bool = False, + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None, + logits_processor_pattern: Optional[str] = None, + generation_config: Optional[str] = None, + enable_sleep_mode: bool = False, + override_generation_config: Optional[Dict[str, Any]] = None, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, + ) -> None: + self.model = model + self.tokenizer = tokenizer + self.tokenizer_mode = tokenizer_mode + self.trust_remote_code = trust_remote_code + self.allowed_local_media_path = allowed_local_media_path + self.seed = seed + self.revision = revision + self.code_revision = code_revision + self.rope_scaling = rope_scaling + self.rope_theta = rope_theta + self.model_impl = model_impl + + if hf_overrides is None: + hf_overrides = {} + + if callable(hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = hf_overrides + else: + hf_overrides_kw = hf_overrides + hf_overrides_fn = None + + if rope_scaling is not None: + hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} + hf_overrides_kw.update(hf_override) + msg = ("`--rope-scaling` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if rope_theta is not None: + hf_override = {"rope_theta": rope_theta} + hf_overrides_kw.update(hf_override) + msg = ("`--rope-theta` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + self.maybe_pull_model_tokenizer_for_s3(model, tokenizer) + + # The tokenizer version is consistent with the model version by default. 
+ if tokenizer_revision is None: + self.tokenizer_revision = revision + else: + self.tokenizer_revision = tokenizer_revision + self.quantization = quantization + self.enforce_eager = enforce_eager + self.max_seq_len_to_capture = max_seq_len_to_capture + self.max_logprobs = max_logprobs + self.disable_sliding_window = disable_sliding_window + self.skip_tokenizer_init = skip_tokenizer_init + self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") + + hf_config = get_config(self.model, trust_remote_code, revision, + code_revision, config_format) + + if hf_overrides_kw: + logger.info("Overriding HF config with %s", hf_overrides_kw) + hf_config.update(hf_overrides_kw) + if hf_overrides_fn: + logger.info("Overriding HF config with %s", hf_overrides_fn) + hf_config = hf_overrides_fn(hf_config) + + self.hf_config = hf_config + + self.hf_text_config = get_hf_text_config(self.hf_config) + self.encoder_config = self._get_encoder_config() + self.hf_image_processor_config = get_hf_image_processor_config( + self.model, revision) + self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) + self.use_async_output_proc = use_async_output_proc + self.mm_processor_kwargs = mm_processor_kwargs + self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache + + # Set enforce_eager to False if the value is unset. + if self.enforce_eager is None: + self.enforce_eager = False + + sliding_window = getattr(self.hf_text_config, "sliding_window", None) + has_interleaved_attention = (sliding_window is not None) and ( + isinstance(sliding_window, list) or + (self.hf_text_config.model_type in ["gemma2", "cohere2"])) + + if (not self.disable_sliding_window and has_interleaved_attention): + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): + sliding_window_len_min = get_min_sliding_window( + self.hf_text_config.sliding_window) + + logger.warning_once( + f"{self.hf_text_config.model_type} has interleaved " + "attention, which is currently not supported by the " + f"{backend} backend. Disabling sliding window and capping " + "the max length to the sliding window size " + f"({sliding_window_len_min}).") + self.disable_sliding_window = True + else: + # for a model with interleaved attention, + # the scheduler and the model treat it as full attention + # (i.e., not dropping any tokens outside the window). + # only the attention layer itself is aware of the sliding + # window, and use the window size to compute the attention. 
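+                # e.g. gemma2 interleaves sliding-window and full-attention
+                # layers: the window size is preserved under
+                # `interleaved_sliding_window` while `sliding_window` is
+                # removed, so the rest of the engine sees the full context.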
+ self.hf_text_config.interleaved_sliding_window = sliding_window + delattr(self.hf_text_config, "sliding_window") + sliding_window = None + + self.max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window_len=self.get_hf_config_sliding_window(), + spec_target_max_model_len=spec_target_max_model_len, + encoder_config=self.encoder_config) + self.served_model_name = get_served_model_name(model, + served_model_name) + self.multimodal_config = self._init_multimodal_config( + limit_mm_per_prompt) + if not self.skip_tokenizer_init: + self._verify_tokenizer_mode() + + self.is_attention_free = self._init_attention_free() + self.is_hybrid = self._init_is_hybrid() + self.has_inner_state = self._init_has_inner_state() + + if current_platform.is_neuron(): + self.override_neuron_config = override_neuron_config + else: + self.override_neuron_config = None + + supported_tasks, task = self._resolve_task(task, self.hf_config) + self.supported_tasks = supported_tasks + self.task: Final = task + if self.task in ("draft", "generate"): + self.truncation_side = "left" + else: + self.truncation_side = "right" + + self.pooler_config = self._init_pooler_config(override_pooler_config) + self.logits_processor_pattern = logits_processor_pattern + + self.generation_config = generation_config + self.override_generation_config = override_generation_config or {} + + self._verify_quantization() + self._verify_cuda_graph() + self._verify_bnb_config() + + def maybe_pull_model_tokenizer_for_s3(self, model: str, + tokenizer: str) -> None: + """ + Pull the model config or tokenizer to a temporary + directory in case of S3. + + Args: + model: The model name or path. + tokenizer: The tokenizer name or path. 
+ + """ + if is_s3(model) or is_s3(tokenizer): + if is_s3(model): + s3_model = S3Model() + s3_model.pull_files(model, allow_pattern=["*config.json"]) + self.model_weights = self.model + self.model = s3_model.dir + + if is_s3(tokenizer): + s3_tokenizer = S3Model() + s3_tokenizer.pull_files( + model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) + self.tokenizer = s3_tokenizer.dir + + def _init_multimodal_config( + self, limit_mm_per_prompt: Optional[Mapping[str, int]] + ) -> Optional["MultiModalConfig"]: + architectures = getattr(self.hf_config, "architectures", []) + if ModelRegistry.is_multimodal_model(architectures): + return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {}) + + if limit_mm_per_prompt: + raise ValueError("`limit_mm_per_prompt` is only supported for " + "multimodal models.") + + return None + + def _get_encoder_config(self): + return get_sentence_transformer_tokenizer_config( + self.model, self.revision) + + def _init_pooler_config( + self, + override_pooler_config: Optional["PoolerConfig"], + ) -> Optional["PoolerConfig"]: + + if self.runner_type == "pooling": + user_config = override_pooler_config or PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(user_config, k) is None: + setattr(user_config, k, v) + + return user_config + + return None + + def _init_attention_free(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_attention_free_model(architectures) + + def _init_is_hybrid(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_hybrid_model(architectures) + + def _init_has_inner_state(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.model_has_inner_state(architectures) + + def _verify_tokenizer_mode(self) -> None: + tokenizer_mode = self.tokenizer_mode.lower() + if tokenizer_mode not in ["auto", "slow", "mistral"]: + raise ValueError( + f"Unknown tokenizer mode: {self.tokenizer_mode}. 
Must be " + "either 'auto', 'slow' or 'mistral'.") + self.tokenizer_mode = tokenizer_mode + + def _get_preferred_task( + self, + architectures: List[str], + supported_tasks: Set[_ResolvedTask], + ) -> Optional[_ResolvedTask]: + model_id = self.model + if get_pooling_config(model_id, self.revision): + return "embed" + if ModelRegistry.is_cross_encoder_model(architectures): + return "score" + + suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ForSequenceClassification", "classify"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embed"), + ("RewardModel", "reward"), + ] + _, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + return pref_task + + return None + + def _resolve_task( + self, + task_option: Union[TaskOption, Literal["draft"]], + hf_config: PretrainedConfig, + ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: + if task_option == "draft": + return {"draft"}, "draft" + + architectures = getattr(hf_config, "architectures", []) + + runner_support: Dict[RunnerType, bool] = { + # NOTE: Listed from highest to lowest priority, + # in case the model supports multiple of them + "generate": ModelRegistry.is_text_generation_model(architectures), + "pooling": ModelRegistry.is_pooling_model(architectures), + } + supported_runner_types_lst: List[RunnerType] = [ + runner_type + for runner_type, is_supported in runner_support.items() + if is_supported + ] + + supported_tasks_lst: List[_ResolvedTask] = [ + task for runner_type in supported_runner_types_lst + for task in _RUNNER_TASKS[runner_type] + ] + supported_tasks = set(supported_tasks_lst) + + if task_option == "auto": + selected_task = next(iter(supported_tasks_lst)) + + if len(supported_tasks_lst) > 1: + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task is not None: + selected_task = preferred_task + + logger.info( + "This model supports multiple tasks: %s. " + "Defaulting to '%s'.", supported_tasks, selected_task) + else: + # Aliases + if task_option == "embedding": + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task != "embed": + msg = ("The 'embedding' task will be restricted to " + "embedding models in a future release. Please " + "pass `--task classify`, `--task score`, or " + "`--task reward` explicitly for other pooling " + "models.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + task_option = preferred_task or "embed" + + if task_option not in supported_tasks: + msg = ( + f"This model does not support the '{task_option}' task. 
" + f"Supported tasks: {supported_tasks}") + raise ValueError(msg) + + selected_task = task_option + + return supported_tasks, selected_task + + def _parse_quant_hf_config(self): + quant_cfg = getattr(self.hf_config, "quantization_config", None) + if quant_cfg is None: + # compressed-tensors uses a "compression_config" key + quant_cfg = getattr(self.hf_config, "compression_config", None) + return quant_cfg + + def _verify_quantization(self) -> None: + supported_quantization = QUANTIZATION_METHODS + optimized_quantization_methods = [ + "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", + "awq_marlin", "fbgemm_fp8", "compressed_tensors", + "compressed-tensors", "experts_int8", "quark" + ] + if self.quantization is not None: + self.quantization = self.quantization.lower() + + # Parse quantization method from the HF model config, if available. + quant_cfg = self._parse_quant_hf_config() + + if quant_cfg is not None: + quant_method = quant_cfg.get("quant_method", "").lower() + + # Detect which checkpoint is it + for name in QUANTIZATION_METHODS: + method = get_quantization_config(name) + quantization_override = method.override_quantization_method( + quant_cfg, self.quantization) + if quantization_override: + quant_method = quantization_override + self.quantization = quantization_override + break + + # Verify quantization configurations. + if self.quantization is None: + self.quantization = quant_method + elif self.quantization != quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization}).") + + if self.quantization is not None: + if self.quantization not in supported_quantization: + raise ValueError( + f"Unknown quantization method: {self.quantization}. Must " + f"be one of {supported_quantization}.") + from vllm.platforms import current_platform + current_platform.verify_quantization(self.quantization) + if self.quantization not in optimized_quantization_methods: + logger.warning( + "%s quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.", self.quantization) + + def _verify_cuda_graph(self) -> None: + if self.max_seq_len_to_capture is None: + self.max_seq_len_to_capture = self.max_model_len + self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, + self.max_model_len) + + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama'] + if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH + and not self.enforce_eager): + logger.warning( + "CUDA graph is not supported for %s yet, fallback to the eager " + "mode.", self.hf_config.model_type) + self.enforce_eager = True + + def _verify_bnb_config(self) -> None: + """ + The current version of bitsandbytes (0.44.0) with 8-bit models does not + yet support CUDA graph. 
+ """ + is_bitsandbytes = self.quantization == "bitsandbytes" + has_quantization_config = (getattr(self.hf_config, + "quantization_config", None) + is not None) + is_8bit = (self.hf_config.quantization_config.get( + "load_in_8bit", False) if has_quantization_config else False) + if all([ + is_bitsandbytes, + has_quantization_config, + is_8bit, + not self.enforce_eager, + ]): + logger.warning( + "CUDA graph is not supported on BitAndBytes 8bit yet, " + "fallback to the eager mode.") + self.enforce_eager = True + + def verify_async_output_proc(self, parallel_config, speculative_config, + device_config) -> None: + if not self.use_async_output_proc: + # Nothing to check + return + + if parallel_config.pipeline_parallel_size > 1: + logger.warning("Async output processing can not be enabled " + "with pipeline parallel") + self.use_async_output_proc = False + return + + # Reminder: Please update docs/source/features/compatibility_matrix.md + # If the feature combo become valid + from vllm.platforms import current_platform + if not current_platform.is_async_output_supported(self.enforce_eager): + logger.warning( + "Async output processing is not supported on the " + "current platform type %s.", current_platform.device_type) + self.use_async_output_proc = False + return + + if envs.VLLM_USE_RAY_SPMD_WORKER: + logger.warning( + "Async output processing can not be enabled with ray spmd") + self.use_async_output_proc = False + return + + # Async postprocessor is not necessary for pooling models + # since there is no token generation + if self.runner_type == "pooling": + self.use_async_output_proc = False + + # Reminder: Please update docs/source/features/compatibility_matrix.md + # If the feature combo become valid + if speculative_config: + logger.warning("Async output processing is not supported with" + " speculative decoding currently.") + self.use_async_output_proc = False + + def verify_with_parallel_config( + self, + parallel_config: "ParallelConfig", + ) -> None: + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) + tensor_parallel_size = parallel_config.tensor_parallel_size + if total_num_attention_heads % tensor_parallel_size != 0: + raise ValueError( + f"Total number of attention heads ({total_num_attention_heads})" + " must be divisible by tensor parallel size " + f"({tensor_parallel_size}).") + + pipeline_parallel_size = parallel_config.pipeline_parallel_size + if pipeline_parallel_size > 1: + architectures = getattr(self.hf_config, "architectures", []) + if not ModelRegistry.is_pp_supported_model(architectures): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. " + "Supported models implement the `SupportsPP` interface.") + + if self.use_async_output_proc: + logger.warning("Async output processor is not supported with " + "pipeline parallelism currently. Disabling it.") + self.use_async_output_proc = False + + def get_hf_config_sliding_window( + self) -> Union[Optional[int], List[Optional[int]]]: + """Get the sliding window size, or None if disabled.""" + + # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in + # addition to sliding window size. We check if that field is present + # and if it's False, return None. 
+ if (hasattr(self.hf_text_config, "use_sliding_window") + and not self.hf_text_config.use_sliding_window): + return None + return getattr(self.hf_text_config, "sliding_window", None) + + def get_sliding_window(self) -> Optional[Union[int, List[Optional[int]]]]: + """Get the sliding window size, or None if disabled. + """ + # If user disables sliding window, return None. + if self.disable_sliding_window: + return None + # Otherwise get the value from the hf config. + return self.get_hf_config_sliding_window() + + def get_vocab_size(self) -> int: + return self.hf_text_config.vocab_size + + def get_hidden_size(self) -> int: + return self.hf_text_config.hidden_size + + @property + def is_deepseek_mla(self) -> bool: + return (hasattr(self.hf_text_config, "model_type")) \ + and (self.hf_text_config.model_type in \ + ('deepseek_v2', 'deepseek_v3'))\ + and (self.hf_text_config.kv_lora_rank is not None) + + def get_head_size(self) -> int: + # TODO remove hard code + if self.is_deepseek_mla: + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", + 0) + if self.use_mla: + return self.hf_text_config.kv_lora_rank + qk_rope_head_dim + else: + qk_nope_head_dim = getattr(self.hf_text_config, + "qk_nope_head_dim", 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim + + if self.is_attention_free: + return 0 + + if hasattr(self.hf_text_config, "head_dim"): + return self.hf_text_config.head_dim + # FIXME(woosuk): This may not be true for all models. + return (self.hf_text_config.hidden_size // + self.hf_text_config.num_attention_heads) + + def get_total_num_kv_heads(self) -> int: + """Returns the total number of KV heads.""" + # For GPTBigCode & Falcon: + # NOTE: for falcon, when new_decoder_architecture is True, the + # multi_query flag is ignored and we use n_head_kv for the number of + # KV heads. + falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] + new_decoder_arch_falcon = ( + self.hf_config.model_type in falcon_model_types + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_text_config, + "multi_query", False): + # Multi-query attention, only one KV head. + # Currently, tensor parallelism is not supported in this case. + return 1 + + # For DBRX and MPT + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": + return getattr(self.hf_config.attn_config, "kv_n_heads", + self.hf_config.num_attention_heads) + + if self.is_attention_free: + return 0 + + attributes = [ + # For Falcon: + "n_head_kv", + "num_kv_heads", + # For LLaMA-2: + "num_key_value_heads", + # For ChatGLM: + "multi_query_group_num", + ] + for attr in attributes: + num_kv_heads = getattr(self.hf_text_config, attr, None) + if num_kv_heads is not None: + return num_kv_heads + + # For non-grouped-query attention models, the number of KV heads is + # equal to the number of attention heads. + return self.hf_text_config.num_attention_heads + + def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: + """Returns the number of KV heads per GPU.""" + if self.use_mla: + # When using MLA during decode it becomes MQA + return 1 + + total_num_kv_heads = self.get_total_num_kv_heads() + # If tensor parallelism is used, we divide the number of KV heads by + # the tensor parallel size. 
We will replicate the KV heads in the + # case where the number of KV heads is smaller than the tensor + # parallel size so each GPU has at least one KV head. + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) + + def get_num_attention_heads(self, + parallel_config: "ParallelConfig") -> int: + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size + + def get_layers_start_end_indices( + self, parallel_config: "ParallelConfig") -> Tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) + pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size + pp_size = parallel_config.pipeline_parallel_size + start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) + return start, end + + def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start + + def get_num_layers_by_block_type( + self, + parallel_config: "ParallelConfig", + block_type: LayerBlockType = LayerBlockType.attention, + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == LayerBlockType.attention + is_transformer = not self.is_hybrid and not self.is_attention_free + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + else: + # Hybrid model + layers_block_type_value = getattr(self.hf_config, + "layers_block_type", None) + if layers_block_type_value is None: + raise ValueError("The model is an hybrid without a" + "layers_block_type in the hf_config," + "cannot determine the num of " + f"{block_type.value} layers") + + return sum(t == block_type.value + for t in layers_block_type_value[start:end]) + + def get_multimodal_config(self) -> "MultiModalConfig": + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + if self.multimodal_config is None: + raise ValueError("The model is not multimodal.") + + return self.multimodal_config + + def try_get_generation_config(self) -> Dict[str, Any]: + if self.generation_config is None or self.generation_config == "auto": + config = try_get_generation_config( + self.model, + trust_remote_code=self.trust_remote_code, + revision=self.revision, + ) + else: + config = try_get_generation_config( + self.generation_config, + trust_remote_code=self.trust_remote_code, + ) + + if config is None: + return {} + + return config.to_diff_dict() + + def get_diff_sampling_param(self) -> Dict[str, Any]: + """ + This method returns a dictionary containing the parameters + that differ from the default sampling parameters, but only + if `generation_config` is set. If `generation_config` is not + set, an empty dictionary is returned. + + Returns: + Dict[str, Any]: A dictionary with the differing sampling + parameters if `generation_config` is set, otherwise an + empty dictionary. 
+ """ + if self.generation_config is None: + # When generation_config is not set + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + + available_params = [ + "repetition_penalty", + "temperature", + "top_k", + "top_p", + "min_p", + "max_new_tokens", + ] + if any(p in config for p in available_params): + diff_sampling_param = { + p: config.get(p) + for p in available_params if config.get(p) is not None + } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") + else: + diff_sampling_param = {} + return diff_sampling_param + + @property + def is_encoder_decoder(self) -> bool: + """Extract the HF encoder/decoder model flag.""" + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) + + @property + def is_multimodal_model(self) -> bool: + return self.multimodal_config is not None + + @property + def is_cross_encoder(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_cross_encoder_model(architectures) + + @property + def use_mla(self) -> bool: + if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE: + return False + + if self.quantization is not None and self.quantization not in [\ + "fp8", "compressed-tensors"]: + logger.warning( + "MLA is not supported with %s quantization. " + "Disabling MLA.", self.quantization) + return False + + # If using a "compressed-tensors" checkpoint, check that all groups + # have fp8 for both weights and activations. + if self.quantization == "compressed-tensors": + quant_config = self._parse_quant_hf_config() + for group_name, cfg in quant_config.get("config_groups", { + "": {} + }).items(): + act_cfg = cfg.get("input_activations", {}) + act_type = None if act_cfg is None else act_cfg.get("type", "") + w_cfg = cfg.get("weights", {}) + w_type = None if w_cfg is None else w_cfg.get("type", "") + if act_type != "fp8" or w_type != "fp8": + logger.warning( + "compressed-tensors MLA support requires fp8 " + "activations and weights in group '%s', but got " + "activations type '%s' and weights type '%s'.\n " + "Full config: %s", group_name, act_type, w_type, + quant_config) + return False + + return True + + @property + def supported_runner_types(self) -> Set[RunnerType]: + return {_TASK_RUNNER[task] for task in self.supported_tasks} + + @property + def runner_type(self) -> RunnerType: + return _TASK_RUNNER[self.task] + + +class CacheConfig: + """Configuration for the KV cache. + + Args: + block_size: Size of a cache block in number of tokens. + gpu_memory_utilization: Fraction of GPU memory to use for the + vLLM execution. + swap_space: Size of the CPU swap space per GPU (in GiB). + cache_dtype: Data type for kv cache storage. + is_attention_free: Whether the model is attention-free. + num_gpu_blocks_override: Number of GPU blocks to use. This overrides the + profiled num_gpu_blocks if specified. Does nothing if None. + sliding_window: Sliding window size for the KV cache. Can not work with + prefix caching enabled. + enable_prefix_caching: Whether to enable prefix caching. + cpu_offload_gb: Size of the CPU offload buffer in GiB. 
+ """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.cache_dtype) + # `cpu_offload_gb` does not use `torch.compile` yet. + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __init__( + self, + block_size: int, + gpu_memory_utilization: float, + swap_space: float, + cache_dtype: str, + is_attention_free: bool = False, + num_gpu_blocks_override: Optional[int] = None, + sliding_window: Optional[int] = None, + enable_prefix_caching: bool = False, + cpu_offload_gb: float = 0, + calculate_kv_scales: Optional[bool] = None, + ) -> None: + self.block_size = block_size + self.gpu_memory_utilization = gpu_memory_utilization + self.swap_space_bytes = swap_space * GiB_bytes + self.num_gpu_blocks_override = num_gpu_blocks_override + self.cache_dtype = cache_dtype + self.is_attention_free = is_attention_free + self.sliding_window = sliding_window + self.enable_prefix_caching = enable_prefix_caching + self.cpu_offload_gb = cpu_offload_gb + self.calculate_kv_scales = calculate_kv_scales + self._verify_args() + self._verify_cache_dtype() + self._verify_prefix_caching() + + # Will be set after profiling. + self.num_gpu_blocks: Optional[int] = None + self.num_cpu_blocks: Optional[int] = None + + # Set calculate_kv_scales to False if the value is unset. + if self.calculate_kv_scales is None: + self.calculate_kv_scales = False + + def metrics_info(self): + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info + return {key: str(value) for key, value in self.__dict__.items()} + + def _verify_args(self) -> None: + if self.gpu_memory_utilization > 1.0: + raise ValueError( + "GPU memory utilization must be less than 1.0. Got " + f"{self.gpu_memory_utilization}.") + + def _verify_cache_dtype(self) -> None: + if self.cache_dtype == "auto": + pass + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): + logger.info( + "Using fp8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor") + else: + raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") + + def _verify_prefix_caching(self) -> None: + if not self.enable_prefix_caching: + return + + if self.sliding_window is not None: + raise NotImplementedError( + "Prefix caching is not supported with sliding window. " + "Run with --disable-sliding-window to use prefix caching.") + + def verify_with_parallel_config( + self, + parallel_config: "ParallelConfig", + ) -> None: + total_cpu_memory = get_cpu_memory() + # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel + # group are in the same node. However, the GPUs may span multiple nodes. + num_gpus_per_node = parallel_config.tensor_parallel_size + cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node + + msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " + f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " + "is allocated for the swap space.") + if cpu_memory_usage > 0.7 * total_cpu_memory: + raise ValueError("Too large swap space. 
" + msg) + elif cpu_memory_usage > 0.4 * total_cpu_memory: + logger.warning("Possibly too large swap space. %s", msg) + + +@dataclass +class TokenizerPoolConfig: + """Configuration for the tokenizer pool. + + Args: + pool_size: Number of tokenizer workers in the pool. + pool_type: Type of the pool. + extra_config: Additional config for the pool. + The way the config will be used depends on the + pool type. + """ + pool_size: int + pool_type: Union[str, Type["BaseTokenizerGroup"]] + extra_config: dict + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __post_init__(self): + if self.pool_type not in ("ray", ) and not isinstance( + self.pool_type, type): + raise ValueError(f"Unknown pool type: {self.pool_type}") + if not isinstance(self.extra_config, dict): + raise ValueError("extra_config must be a dictionary.") + + @classmethod + def create_config( + cls, tokenizer_pool_size: int, + tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]], + tokenizer_pool_extra_config: Optional[Union[str, dict]] + ) -> Optional["TokenizerPoolConfig"]: + """Create a TokenizerPoolConfig from the given parameters. + + If tokenizer_pool_size is 0, return None. + + Args: + tokenizer_pool_size: Number of tokenizer workers in the pool. + tokenizer_pool_type: Type of the pool. + tokenizer_pool_extra_config: Additional config for the pool. + The way the config will be used depends on the + pool type. This can be a JSON string (will be parsed). + """ + if tokenizer_pool_size: + if isinstance(tokenizer_pool_extra_config, str): + tokenizer_pool_extra_config_parsed = json.loads( + tokenizer_pool_extra_config) + else: + tokenizer_pool_extra_config_parsed = ( + tokenizer_pool_extra_config or {}) + tokenizer_pool_config = cls(tokenizer_pool_size, + tokenizer_pool_type, + tokenizer_pool_extra_config_parsed) + else: + tokenizer_pool_config = None + return tokenizer_pool_config + + +class LoadFormat(str, enum.Enum): + AUTO = "auto" + PT = "pt" + SAFETENSORS = "safetensors" + NPCACHE = "npcache" + DUMMY = "dummy" + TENSORIZER = "tensorizer" + SHARDED_STATE = "sharded_state" + GGUF = "gguf" + BITSANDBYTES = "bitsandbytes" + MISTRAL = "mistral" + RUNAI_STREAMER = "runai_streamer" + + +@dataclass +class LoadConfig: + """ + download_dir: Directory to download and load the weights, default to the + default cache directory of huggingface. + load_format: The format of the model weights to load: + "auto" will try to load the weights in the safetensors format and + fall back to the pytorch bin format if safetensors format is + not available. + "pt" will load the weights in the pytorch bin format. + "safetensors" will load the weights in the safetensors format. + "npcache" will load the weights in pytorch format and store + a numpy cache to speed up the loading. + "dummy" will initialize the weights with random values, which is + mainly for profiling. + "tensorizer" will use CoreWeave's tensorizer library for + fast weight loading. 
+ "bitsandbytes" will load nf4 type weights. + model_loader_extra_config: The extra config for the model loader. + ignore_patterns: The list of patterns to ignore when loading the model. + Default to "original/**/*" to avoid repeated loading of llama's + checkpoints. + """ + + load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO + download_dir: Optional[str] = None + model_loader_extra_config: Optional[Union[str, dict]] = field( + default_factory=dict) + ignore_patterns: Optional[Union[List[str], str]] = None + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __post_init__(self): + model_loader_extra_config = self.model_loader_extra_config or {} + if isinstance(model_loader_extra_config, str): + self.model_loader_extra_config = json.loads( + model_loader_extra_config) + if isinstance(self.load_format, str): + load_format = self.load_format.lower() + self.load_format = LoadFormat(load_format) + + if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: + logger.info( + "Ignoring the following patterns when downloading weights: %s", + self.ignore_patterns) + else: + self.ignore_patterns = ["original/**/*"] + + +@dataclass +class ParallelConfig: + """Configuration for the distributed execution.""" + + pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. + tensor_parallel_size: int = 1 # Number of tensor parallel groups. + + # Maximum number of multiple batches + # when load model sequentially. To avoid RAM OOM when using tensor + # parallel and large models. + max_parallel_loading_workers: Optional[int] = None + + # Disable the custom all-reduce kernel and fall back to NCCL. + disable_custom_all_reduce: bool = False + + # Config for the tokenizer pool. If None, will use synchronous tokenization. + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None + + # Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. + ray_workers_use_nsight: bool = False + + # ray distributed model workers placement group. + placement_group: Optional["PlacementGroup"] = None + + # Backend to use for distributed model + # workers, either "ray" or "mp" (multiprocessing). If the product + # of pipeline_parallel_size and tensor_parallel_size is less than + # or equal to the number of GPUs available, "mp" will be used to + # keep processing on a single host. Otherwise, this will default + # to "ray" if Ray is installed and fail otherwise. Note that tpu + # and hpu only support Ray for distributed inference. + distributed_executor_backend: Optional[Union[str, + Type["ExecutorBase"]]] = None + + # the full name of the worker class to use. If "auto", the worker class + # will be determined based on the platform. 
+ worker_cls: str = "auto" + sd_worker_cls: str = "auto" + + world_size: int = field(init=False) + + rank: int = 0 + + def compute_hash(self): + """ + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + + ray_only_devices = ["tpu"] + from vllm.platforms import current_platform + if (current_platform.device_type in ray_only_devices + and self.world_size > 1): + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + f"{current_platform.device_type.upper()} backend only " + "supports Ray for distributed inference.") + + if self.distributed_executor_backend is None and self.world_size > 1: + # We use multiprocessing by default if world_size fits on the + # current node and we aren't in a ray placement group. + + from vllm.executor import ray_utils + backend = "mp" + ray_found = ray_utils.ray_is_available() + if current_platform.is_neuron(): + # neuron uses single process to control multiple devices + backend = "uni" + elif (current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size): + if not ray_found: + raise ValueError("Unable to load Ray which is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`.") from ray_utils.ray_import_err + backend = "ray" + elif ray_found: + if self.placement_group: + backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" + self.distributed_executor_backend = backend + logger.info("Defaulting to use %s for distributed inference", + backend) + + self._verify_args() + + @property + def use_ray(self) -> bool: + return self.distributed_executor_backend == "ray" or ( + isinstance(self.distributed_executor_backend, type) + and self.distributed_executor_backend.uses_ray) + + def _verify_args(self) -> None: + # Lazy import to avoid circular import + from vllm.executor.executor_base import ExecutorBase + from vllm.platforms import current_platform + if self.distributed_executor_backend not in ( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( + self.distributed_executor_backend, type) and issubclass( + self.distributed_executor_backend, ExecutorBase)): + raise ValueError( + "Unrecognized distributed executor backend " + f"{self.distributed_executor_backend}. 
Supported " + "values are 'ray', 'mp' 'uni', 'external_launcher' or" + " custom ExecutorBase subclass.") + if self.use_ray: + from vllm.executor import ray_utils + ray_utils.assert_ray_available() + if current_platform.is_rocm(): + self.disable_custom_all_reduce = True + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported on AMD GPUs.") + if self.ray_workers_use_nsight and not self.use_ray: + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") + + +@dataclass +class SchedulerConfig: + """Scheduler configuration.""" + + runner_type: str = "generate" # The runner type to launch for the model. + + # Maximum number of tokens to be processed in a single iteration. + max_num_batched_tokens: int = field(default=None) # type: ignore + + # Maximum number of sequences to be processed in a single iteration. + max_num_seqs: int = 128 + + # Maximum length of a sequence (including prompt and generated text). + max_model_len: int = 8192 + + # The number of slots to allocate per sequence per + # step, beyond the known token ids. This is used in speculative + # decoding to store KV activations of tokens which may or may not be + # accepted. + num_lookahead_slots: int = 0 + + # Apply a delay (of delay factor multiplied by previous + # prompt latency) before scheduling next prompt. + delay_factor: float = 0.0 + + # If True, prefill requests can be chunked based + # on the remaining max_num_batched_tokens. + enable_chunked_prefill: bool = False + + is_multimodal_model: bool = False + + # NOTE: The following multimodal encoder budget will be initialized to + # max_num_batched_tokens and overridden in case max multimodal embedding + # size is larger. + # TODO (ywang96): Make these configurable. + # Multimodal encoder compute budget, only used in V1 + max_num_encoder_input_tokens: int = field(default=None) # type: ignore + + # Multimodal encoder cache size, only used in V1 + encoder_cache_size: int = field(default=None) # type: ignore + + # Whether to perform preemption by swapping or + # recomputation. If not specified, we determine the mode as follows: + # We use recomputation by default since it incurs lower overhead than + # swapping. However, when the sequence group has multiple sequences + # (e.g., beam search), recomputation is not currently supported. In + # such a case, we use swapping instead. + preemption_mode: Optional[str] = None + + num_scheduler_steps: int = 1 + + multi_step_stream_outputs: bool = False + + # Private API. If used, scheduler sends delta data to + # workers instead of an entire data. It should be enabled only + # when SPMD worker architecture is enabled. I.e., + # VLLM_USE_RAY_SPMD_WORKER=1 + send_delta_data: bool = False + + # The scheduling policy to use. "fcfs" (default) or "priority". + policy: str = "fcfs" + + chunked_prefill_enabled: bool = field(init=False) + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
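+        # Illustrative note (not part of the original source): with an empty
+        # factors list, the hash below reduces to
+        # hashlib.md5(str([]).encode()).hexdigest(), so every SchedulerConfig
+        # currently produces the same value.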
+ factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.max_num_batched_tokens is None: + if self.enable_chunked_prefill: + if self.num_scheduler_steps > 1: + # Multi-step Chunked-Prefill doesn't allow prompt-chunking + # for now. Have max_num_batched_tokens set to max_model_len + # so we don't reject sequences on account of a short + # max_num_batched_tokens. + self.max_num_batched_tokens = max(self.max_model_len, 2048) + else: + # This value is chosen to have a balance between ITL + # and TTFT. Note it is not optimized for throughput. + self.max_num_batched_tokens = 2048 + else: + # If max_model_len is too short, use 2048 as the default value + # for higher throughput. + self.max_num_batched_tokens = max(self.max_model_len, 2048) + + if self.runner_type == "pooling": + # Choose specific value for higher throughput + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + if self.is_multimodal_model: + # The value needs to be at least the number of multimodal tokens + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + + self.max_num_encoder_input_tokens = self.max_num_batched_tokens + self.encoder_cache_size = self.max_num_batched_tokens + + if self.enable_chunked_prefill: + logger.info( + "Chunked prefill is enabled with max_num_batched_tokens=%d.", + self.max_num_batched_tokens) + + self.chunked_prefill_enabled = self.enable_chunked_prefill + self._verify_args() + + def _verify_args(self) -> None: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({self.max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. Please increase max_num_batched_tokens or " + "decrease max_model_len.") + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + @property + def is_multi_step(self) -> bool: + return self.num_scheduler_steps > 1 + + +class DeviceConfig: + device: Optional[torch.device] + device_type: str + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # the device/platform information will be summarized + # by torch/vllm automatically. 
+ factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __init__(self, device: str = "auto") -> None: + if device == "auto": + # Automated device type detection + from vllm.platforms import current_platform + self.device_type = current_platform.device_type + if not self.device_type: + raise RuntimeError("Failed to infer device type") + else: + # Device type is assigned explicitly + self.device_type = device + + # Some device types require processing inputs on CPU + if self.device_type in ["neuron", "openvino"]: + self.device = torch.device("cpu") + elif self.device_type in ["tpu"]: + self.device = None + else: + # Set device with device type + self.device = torch.device(self.device_type) + + +class SpeculativeConfig: + """Configuration for speculative decoding. + + The configuration is currently specialized to draft-model speculative + decoding with top-1 proposals. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # spec decode does not use `torch.compile` yet. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + @staticmethod + def maybe_create_spec_config( + target_model_config: ModelConfig, + target_parallel_config: ParallelConfig, + target_dtype: str, + speculative_model: Optional[str], + speculative_model_quantization: Optional[str], + speculative_draft_tensor_parallel_size: Optional[int], + num_speculative_tokens: Optional[int], + speculative_disable_mqa_scorer: Optional[bool], + speculative_max_model_len: Optional[int], + enable_chunked_prefill: bool, + disable_log_stats: bool, + speculative_disable_by_batch_size: Optional[int], + ngram_prompt_lookup_max: Optional[int], + ngram_prompt_lookup_min: Optional[int], + draft_token_acceptance_method: str, + typical_acceptance_sampler_posterior_threshold: Optional[float], + typical_acceptance_sampler_posterior_alpha: Optional[float], + disable_logprobs: Optional[bool], + ) -> Optional["SpeculativeConfig"]: + """Create a SpeculativeConfig if possible, else return None. + + This function attempts to create a SpeculativeConfig object based on the + provided parameters. If the necessary conditions are met, it returns an + instance of SpeculativeConfig. Otherwise, it returns None. + + Args: + target_model_config (ModelConfig): The configuration of the target + model. + target_parallel_config (ParallelConfig): The parallel configuration + for the target model. + target_dtype (str): The data type used for the target model. + speculative_model (Optional[str]): The name of the speculative + model, if provided. + speculative_model_quantization (Optional[str]): Quantization method + that was used to quantize the speculative model weights. If + None, we assume the model weights are not quantized. + speculative_draft_tensor_parallel_size (Optional[int]): The degree + of the tensor parallelism for the draft model. + num_speculative_tokens (Optional[int]): The number of speculative + tokens, if provided. Will default to the number in the draft + model config if present, otherwise is required. 
+ speculative_disable_mqa_scorer (Optional[bool]): Disable the MQA + scorer for the speculative model and fall back to batch + expansion for scoring. + speculative_max_model_len (Optional[int]): The maximum model len of + the speculative model. Used when testing the ability to skip + speculation for some sequences. + enable_chunked_prefill (bool): Whether vLLM is configured to use + chunked prefill or not. Used for raising an error since its not + yet compatible with spec decode. + speculative_disable_by_batch_size (Optional[int]): Disable + speculative decoding for new incoming requests when the number + of enqueue requests is larger than this value, if provided. + ngram_prompt_lookup_max (Optional[int]): Max size of ngram token + window, if provided. + ngram_prompt_lookup_min (Optional[int]): Min size of ngram token + window, if provided. + draft_token_acceptance_method (str): The method to use for + accepting draft tokens. This can take two possible + values 'rejection_sampler' and 'typical_acceptance_sampler' + for RejectionSampler and TypicalAcceptanceSampler + respectively. + typical_acceptance_sampler_posterior_threshold (Optional[float]): + A threshold value that sets a lower bound on the posterior + probability of a token in the target model for it to be + accepted. This threshold is used only when we use the + TypicalAcceptanceSampler for token acceptance. + typical_acceptance_sampler_posterior_alpha (Optional[float]): + A scaling factor for the entropy-based threshold in the + TypicalAcceptanceSampler. + disable_logprobs (Optional[bool]): If set to True, token log + probabilities are not returned during speculative decoding. + If set to False, token log probabilities are returned + according to the log probability settings in SamplingParams. + If not specified, it defaults to True. + + Returns: + Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if + the necessary conditions are met, else None. + """ + + if speculative_model is None: + if num_speculative_tokens is not None: + raise ValueError("num_speculative_tokens was provided without " + "speculative_model.") + return None + + if (speculative_disable_by_batch_size is not None + and speculative_disable_by_batch_size < 2): + raise ValueError("Expect the batch size threshold of disabling " + "speculative decoding is > 1, but got " + f"{speculative_disable_by_batch_size=}") + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") + # TODO: The user should be able to specify revision/max model len + # for the draft model. It is not currently supported. + draft_revision = None + draft_code_revision = None + draft_quantization = speculative_model_quantization + + if speculative_model == "[ngram]": + if ngram_prompt_lookup_min is None: + ngram_prompt_lookup_min = 1 + if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1: + raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0") + if ngram_prompt_lookup_min < 1: + raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0") + if ngram_prompt_lookup_min > ngram_prompt_lookup_max: + raise ValueError(f"{ngram_prompt_lookup_min=} cannot be " + f"larger than {ngram_prompt_lookup_max=}") + + # TODO: current we still need extract vocab_size from target model + # config, in future, we may try refactor it out, and set + # draft related config as None here. 
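+            # Illustrative example (not part of the original source): passing
+            # speculative_model="[ngram]" with ngram_prompt_lookup_min=1 and
+            # ngram_prompt_lookup_max=4 proposes draft tokens by matching the
+            # most recent 1-4 tokens against earlier context, so no draft
+            # weights are loaded and the target configs are reused below.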
+ draft_model_config = target_model_config + draft_parallel_config = target_parallel_config + else: + ngram_prompt_lookup_max = 0 + ngram_prompt_lookup_min = 0 + draft_model_config = ModelConfig( + model=speculative_model, + task="draft", + tokenizer=target_model_config.tokenizer, + tokenizer_mode=target_model_config.tokenizer_mode, + trust_remote_code=target_model_config.trust_remote_code, + allowed_local_media_path=target_model_config. + allowed_local_media_path, + dtype=target_model_config.dtype, + seed=target_model_config.seed, + revision=draft_revision, + code_revision=draft_code_revision, + tokenizer_revision=target_model_config.tokenizer_revision, + max_model_len=None, + spec_target_max_model_len=target_model_config.max_model_len, + quantization=draft_quantization, + enforce_eager=target_model_config.enforce_eager, + max_seq_len_to_capture=target_model_config. + max_seq_len_to_capture, + max_logprobs=target_model_config.max_logprobs, + ) + + draft_hf_config = draft_model_config.hf_config + + if (num_speculative_tokens is not None + and hasattr(draft_hf_config, "num_lookahead_tokens")): + draft_hf_config.num_lookahead_tokens = num_speculative_tokens + + n_predict = getattr(draft_hf_config, "n_predict", None) + if n_predict is not None: + if num_speculative_tokens is None: + # Default to max value defined in draft model config. + num_speculative_tokens = n_predict + elif num_speculative_tokens > n_predict: + # Verify provided value doesn't exceed the maximum + # supported by the draft model. + raise ValueError( + "This speculative model supports a maximum of " + f"num_speculative_tokens={n_predict}, but " + f"{num_speculative_tokens=} was provided.") + + speculative_draft_tensor_parallel_size = \ + SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config, + speculative_draft_tensor_parallel_size, + draft_hf_config + ) + + draft_model_config.max_model_len = ( + SpeculativeConfig._maybe_override_draft_max_model_len( + speculative_max_model_len, + draft_model_config.max_model_len, + target_model_config.max_model_len, + )) + + draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, + speculative_draft_tensor_parallel_size, draft_hf_config)) + + if num_speculative_tokens is None: + raise ValueError( + "num_speculative_tokens must be provided with " + "speculative_model unless the draft model config contains an " + "n_predict parameter.") + + if typical_acceptance_sampler_posterior_threshold is None: + typical_acceptance_sampler_posterior_threshold = 0.09 + if typical_acceptance_sampler_posterior_alpha is None: + typical_acceptance_sampler_posterior_alpha = 0.3 + if disable_logprobs is None: + disable_logprobs = True + + return SpeculativeConfig( + draft_model_config, + draft_parallel_config, + num_speculative_tokens, + speculative_disable_mqa_scorer, + speculative_disable_by_batch_size, + ngram_prompt_lookup_max, + ngram_prompt_lookup_min, + draft_token_acceptance_method=draft_token_acceptance_method, + typical_acceptance_sampler_posterior_threshold=\ + typical_acceptance_sampler_posterior_threshold, + typical_acceptance_sampler_posterior_alpha=\ + typical_acceptance_sampler_posterior_alpha, + disable_logprobs=disable_logprobs, + disable_log_stats=disable_log_stats, + ) + + @staticmethod + def _maybe_override_draft_max_model_len( + speculative_max_model_len: Optional[int], + draft_max_model_len: int, + target_max_model_len: int, + ) -> int: + """Determine the max sequence len for the draft model. 
This is usually + the draft_max_model_len, but may be the target_max_model_len if it is + less than the draft_max_model_len, or may be speculative_max_model_len + if it is specified. + + This is necessary so that sequences do not exceed the capacity of the + draft model or the target model. + + speculative_max_model_len is mainly used for testing that sequences can + skip speculation. + """ + + if speculative_max_model_len is not None: + + if speculative_max_model_len > draft_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {draft_max_model_len=}") + + if speculative_max_model_len > target_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {target_max_model_len=}") + + return speculative_max_model_len + + return min( + draft_max_model_len, + target_max_model_len, + ) + + @staticmethod + def _verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int], + draft_hf_config: PretrainedConfig) -> int: + """ + Verifies and adjusts the tensor parallel size for a draft model + specified using speculative_draft_tensor_parallel_size. + """ + # If speculative_draft_tensor_parallel_size is unset then set it + # appropriately else verify that it is set correctly. + if speculative_draft_tensor_parallel_size is None: + if draft_hf_config.model_type == "mlp_speculator": + speculative_draft_tensor_parallel_size = 1 + if target_parallel_config.tensor_parallel_size > 1: + logger.warning( + "MLPSpeculator cannot currently be run with tp>1; " + "setting speculative_draft_tensor_parallel_size=1") + else: + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size + elif speculative_draft_tensor_parallel_size not in ( + 1, target_parallel_config.tensor_parallel_size): + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be " + f"other value than 1 or target model tensor_parallel_size") + return speculative_draft_tensor_parallel_size + + @staticmethod + def create_draft_parallel_config( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int, + draft_hf_config: PretrainedConfig, + ) -> ParallelConfig: + """Create a parallel config for use by the draft worker. + + This is mostly a copy of the target parallel config, except the tp_size. + """ + draft_parallel_config = ParallelConfig( + pipeline_parallel_size=target_parallel_config. + pipeline_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, + distributed_executor_backend=target_parallel_config. + distributed_executor_backend, + max_parallel_loading_workers=target_parallel_config. + max_parallel_loading_workers, + disable_custom_all_reduce=target_parallel_config. + disable_custom_all_reduce, + tokenizer_pool_config=target_parallel_config.tokenizer_pool_config, + ray_workers_use_nsight=target_parallel_config. 
+ ray_workers_use_nsight, + placement_group=target_parallel_config.placement_group, + ) + + return draft_parallel_config + + def __init__( + self, + draft_model_config: ModelConfig, + draft_parallel_config: ParallelConfig, + num_speculative_tokens: int, + speculative_disable_mqa_scorer: Optional[bool], + speculative_disable_by_batch_size: Optional[int], + ngram_prompt_lookup_max: Optional[int], + ngram_prompt_lookup_min: Optional[int], + draft_token_acceptance_method: str, + typical_acceptance_sampler_posterior_threshold: float, + typical_acceptance_sampler_posterior_alpha: float, + disable_logprobs: bool, + disable_log_stats: bool, + ): + """Create a SpeculativeConfig object. + + Args: + draft_model_config: ModelConfig for the draft model. + draft_parallel_config: ParallelConfig for the draft model. + num_speculative_tokens: The number of tokens to sample from the + draft model before scoring with the target model. + speculative_disable_by_batch_size: Disable speculative + decoding for new incoming requests when the number of + enqueue requests is larger than this value. + ngram_prompt_lookup_max: Max size of ngram token window. + ngram_prompt_lookup_min: Min size of ngram token window. + draft_token_acceptance_method (str): The method to use for + accepting draft tokens. This can take two possible + values 'rejection_sampler' and 'typical_acceptance_sampler' + for RejectionSampler and TypicalAcceptanceSampler + respectively. + typical_acceptance_sampler_posterior_threshold (Optional[float]): + A threshold value that sets a lower bound on the posterior + probability of a token in the target model for it to be + accepted. This threshold is used only when we use the + TypicalAcceptanceSampler for token acceptance. + typical_acceptance_sampler_posterior_alpha (Optional[float]): + A scaling factor for the entropy-based threshold in the + TypicalAcceptanceSampler. + disable_logprobs: If set to True, token log probabilities will not + be returned even if requested by sampling parameters. This + reduces latency by skipping logprob calculation in proposal + sampling, target sampling, and after accepted tokens are + determined. If set to False, log probabilities will be + returned. + disable_log_stats: Whether to disable periodic printing of stage + times in speculative decoding. + """ + self.draft_model_config = draft_model_config + self.draft_parallel_config = draft_parallel_config + self.num_speculative_tokens = num_speculative_tokens + self.speculative_disable_mqa_scorer = speculative_disable_mqa_scorer + self.speculative_disable_by_batch_size = \ + speculative_disable_by_batch_size + self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0 + self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0 + self.draft_token_acceptance_method = draft_token_acceptance_method + self.typical_acceptance_sampler_posterior_threshold = \ + typical_acceptance_sampler_posterior_threshold + self.typical_acceptance_sampler_posterior_alpha = \ + typical_acceptance_sampler_posterior_alpha + self.disable_logprobs = disable_logprobs + self.disable_log_stats = disable_log_stats + + self._verify_args() + + def _verify_args(self) -> None: + if self.num_speculative_tokens <= 0: + raise ValueError("Expected num_speculative_tokens to be greater " + f"than zero ({self.num_speculative_tokens}).") + + if self.draft_model_config: + self.draft_model_config.verify_with_parallel_config( + self.draft_parallel_config) + # Validate and set draft token acceptance related settings. 
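+        # Illustrative note (not part of the original source): the factory
+        # above fills in typical_acceptance_sampler_posterior_threshold=0.09
+        # and typical_acceptance_sampler_posterior_alpha=0.3 when unset, both
+        # of which satisfy the checks below.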
+ + if (self.draft_token_acceptance_method is None): + raise ValueError("draft_token_acceptance_method is not set. " + "Expected values are rejection_sampler or " + "typical_acceptance_sampler.") + + if (self.draft_token_acceptance_method != 'rejection_sampler' + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): + raise ValueError( + "Expected draft_token_acceptance_method to be either " + "rejection_sampler or typical_acceptance_sampler. Instead it " + f"is {self.draft_token_acceptance_method}") + + if (self.typical_acceptance_sampler_posterior_threshold < 0 + or self.typical_acceptance_sampler_posterior_alpha < 0): + raise ValueError( + "Expected typical_acceptance_sampler_posterior_threshold " + "and typical_acceptance_sampler_posterior_alpha to be > 0. " + "Instead found " + f"typical_acceptance_sampler_posterior_threshold = " + f"{self.typical_acceptance_sampler_posterior_threshold} and " + f"typical_acceptance_sampler_posterior_alpha = " + f"{self.typical_acceptance_sampler_posterior_alpha}") + + @property + def num_lookahead_slots(self) -> int: + """The number of additional slots the scheduler should allocate per + step, in addition to the slots allocated for each known token. + + This is equal to the number of speculative tokens, as each speculative + token must be scored. + """ + return self.num_speculative_tokens + + def __repr__(self) -> str: + if self.ngram_prompt_lookup_max > 0: + draft_model = "[ngram]" + else: + draft_model = self.draft_model_config.model + num_spec_tokens = self.num_speculative_tokens + return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})" + + +@dataclass +class LoRAConfig: + max_lora_rank: int + max_loras: int + fully_sharded_loras: bool = False + max_cpu_loras: Optional[int] = None + lora_dtype: Optional[Union[torch.dtype, str]] = None + lora_extra_vocab_size: int = 256 + # This is a constant. + lora_vocab_padding_size: ClassVar[int] = 256 + long_lora_scaling_factors: Optional[Tuple[float]] = None + bias_enabled: bool = False + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # LoRA is not compatible with `torch.compile` . + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __post_init__(self): + # Setting the maximum rank to 256 should be able to satisfy the vast + # majority of applications. 
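+        # Illustrative example (not part of the original source): a config
+        # such as LoRAConfig(max_lora_rank=16, max_loras=4) passes the checks
+        # below, while max_lora_rank=48 or lora_extra_vocab_size=100 would be
+        # rejected.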
+ possible_max_ranks = (8, 16, 32, 64, 128, 256) + possible_lora_extra_vocab_size = (0, 256, 512) + if self.max_lora_rank not in possible_max_ranks: + raise ValueError( + f"max_lora_rank ({self.max_lora_rank}) must be one of " + f"{possible_max_ranks}.") + if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: + raise ValueError( + f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " + f"must be one of {possible_lora_extra_vocab_size}.") + if self.max_loras < 1: + raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") + if self.max_cpu_loras is None: + self.max_cpu_loras = self.max_loras + elif self.max_cpu_loras < self.max_loras: + raise ValueError( + f"max_cpu_loras ({self.max_cpu_loras}) must be >= " + f"max_loras ({self.max_loras})") + + def verify_with_cache_config(self, cache_config: CacheConfig): + # TODO LoRA supports CPU offload. + if cache_config.cpu_offload_gb > 0: + raise ValueError("CPU offload is not supported with LoRA yet.") + + def verify_with_model_config(self, model_config: ModelConfig): + if self.lora_dtype in (None, "auto"): + self.lora_dtype = model_config.dtype + elif isinstance(self.lora_dtype, str): + self.lora_dtype = getattr(torch, self.lora_dtype) + if model_config.quantization and model_config.quantization not in [ + "awq", "gptq" + ]: + # TODO support marlin + logger.warning("%s quantization is not tested with LoRA yet.", + model_config.quantization) + + def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): + # Reminder: Please update docs/source/features/compatibility_matrix.md + # If the feature combo become valid + if scheduler_config.chunked_prefill_enabled: + logger.warning("LoRA with chunked prefill is still experimental " + "and may be unstable.") + + +@dataclass +class PromptAdapterConfig: + max_prompt_adapters: int + max_prompt_adapter_token: int + max_cpu_prompt_adapters: Optional[int] = None + prompt_adapter_dtype: Optional[torch.dtype] = None + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + def __post_init__(self): + + if self.max_prompt_adapters < 1: + raise ValueError(f"max_prompt_adapters " + f"({self.max_prompt_adapters}) must be >= 1.") + if self.max_prompt_adapter_token == 0: + raise ValueError("max_prompt_adapter_token must be set.") + if self.max_cpu_prompt_adapters is None: + self.max_cpu_prompt_adapters = self.max_prompt_adapters + + def verify_with_model_config(self, model_config: ModelConfig): + if self.prompt_adapter_dtype in (None, "auto"): + self.prompt_adapter_dtype = model_config.dtype + elif isinstance(self.prompt_adapter_dtype, str): + self.prompt_adapter_dtype = getattr(torch, + self.prompt_adapter_dtype) + + +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: Mapping[str, int] = field(default_factory=dict) + """ + The maximum number of input items allowed per prompt for each modality. 
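+    For example (illustrative, not part of the original docstring), a value
+    of ``{"image": 2, "video": 1}`` allows at most two image items and one
+    video item per prompt; modalities that are not listed fall back to the
+    engine's default limit.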
+ """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + # TODO: Add configs to init vision tower or not. + + +@dataclass +class PoolerConfig: + """Controls the behavior of output pooling in pooling models.""" + + pooling_type: Optional[str] = None + """ + The pooling method of the pooling model. This should be a key in + :class:`vllm.model_executor.layers.pooler.PoolingType`. + """ + + normalize: Optional[bool] = None + """ + Whether to normalize the pooled outputs. Usually, this should be set to + ``True`` for embedding outputs. + """ + + softmax: Optional[bool] = None + """ + Whether to apply softmax to the pooled outputs. Usually, this should be set + to ``True`` for classification outputs. + """ + + step_tag_id: Optional[int] = None + """ + If set, only the score corresponding to the ``step_tag_id`` in the + generated sentence should be returned. Otherwise, the scores for all tokens + are returned. + """ + + returned_token_ids: Optional[List[int]] = None + """ + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the + ``math-shepherd-mistral-7b-prm`` model. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + + @staticmethod + def from_json(json_str: str) -> "PoolerConfig": + return PoolerConfig(**json.loads(json_str)) + + +_STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.float16, + "float16": torch.float16, + "float": torch.float32, + "float32": torch.float32, + "bfloat16": torch.bfloat16, +} + +_ROCM_NOT_SUPPORTED_DTYPE: List[str] = [] # + + +def _get_and_verify_dtype( + config: PretrainedConfig, + dtype: Union[str, torch.dtype], +) -> torch.dtype: + # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct + # because config.torch_dtype can be None. + config_dtype = getattr(config, "torch_dtype", None) + if config_dtype is None: + config_dtype = torch.float32 + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + if config_dtype == torch.float32: + if config.model_type == "gemma2": + logger.info( + "For Gemma 2, we downcast float32 to bfloat16 instead " + "of float16 by default. Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 + else: + # Following the common practice, we use float16 for float32 + # models. 
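+                    # Illustrative example (not part of the original source):
+                    # an fp32 checkpoint requested with dtype="auto" therefore
+                    # ends up as torch.float16 here unless a dtype is given
+                    # explicitly.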
+ torch_dtype = torch.float16 + else: + torch_dtype = config_dtype + + from vllm.platforms import current_platform + if (current_platform.is_cpu() + and current_platform.get_cpu_architecture() + == CpuArchEnum.POWERPC + and (config_dtype == torch.float16 + or config_dtype == torch.float32)): + logger.info( + "For POWERPC, we cast models to bfloat16 instead of " + "using float16 by default. Float16 is not currently " + "supported for POWERPC.") + torch_dtype = torch.bfloat16 + + # TODO: change this condition to check if the platform support bf16 + # instead of checking the OS. For instance M2 shall supports bf16 + # already. But we need to modify `cpu_extension.cmake` to activate + # the feature in the build. + if (current_platform.is_cpu() and sys.platform.startswith("darwin") + and current_platform.get_cpu_architecture() + == CpuArchEnum.ARM and config_dtype == torch.bfloat16): + logger.info("For macOS with Apple Silicon, currently bfloat16 " + "is not supported. Setting dtype to float16.") + torch_dtype = torch.float16 + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of" + "using float16 by default. Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + # Verify the dtype. + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + pass + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + pass + else: + # Casting between float16 and bfloat16 is allowed with a warning. + logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + return torch_dtype + + +def _get_and_verify_max_len( + hf_config: PretrainedConfig, + max_model_len: Optional[int], + disable_sliding_window: bool, + sliding_window_len: Optional[Union[int, List[Optional[int]]]], + spec_target_max_model_len: Optional[int] = None, + encoder_config: Optional[Any] = None, +) -> int: + """Get and verify the model's maximum length.""" + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Command-R + "model_max_length", + # Whisper + "max_target_positions", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + # Choose the smallest "max_length" from the possible keys. + max_len_key = None + for key in possible_keys: + max_len = getattr(hf_config, key, None) + if max_len is not None: + max_len_key = key if max_len < derived_max_model_len \ + else max_len_key + derived_max_model_len = min(derived_max_model_len, max_len) + + # If sliding window is manually disabled, max_length should be less + # than the sliding window length in the model config. 
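+    # Illustrative example (not part of the original source): with
+    # max_position_embeddings=32768 and a sliding window of 4096, disabling
+    # the sliding window caps derived_max_model_len at 4096 in the block
+    # below.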
+ if disable_sliding_window and sliding_window_len is not None: + + sliding_window_len_min = get_min_sliding_window(sliding_window_len) + max_len_key = "sliding_window" \ + if sliding_window_len_min < derived_max_model_len else max_len_key + derived_max_model_len = min(derived_max_model_len, + sliding_window_len_min) + + # If none of the keys were found in the config, use a default and + # log a warning. + if derived_max_model_len == float("inf"): + if max_model_len is not None: + # If max_model_len is specified, we use it. + return max_model_len + + if spec_target_max_model_len is not None: + # If this is a speculative draft model, we use the max model len + # from the target model. + return spec_target_max_model_len + + default_max_len = 2048 + logger.warning( + "The model's config.json does not contain any of the following " + "keys to determine the original maximum length of the model: " + "%s. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) + derived_max_model_len = default_max_len + + rope_scaling = getattr(hf_config, "rope_scaling", None) + if rope_scaling is not None: + # No need to consider "type" key because of patch_rope_scaling when + # loading HF config + rope_type = rope_scaling["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + if disable_sliding_window: + # TODO(robertgshaw): Find a model that supports rope_scaling + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "with rope_scaling. Please raise an issue so we can " + "investigate.") + + # NOTE: rope_type == "default" does not define factor + # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + scaling_factor = rope_scaling.get("factor", 1.0) + + if rope_type == "yarn": + derived_max_model_len = rope_scaling[ + "original_max_position_embeddings"] + derived_max_model_len *= scaling_factor + + if encoder_config and "max_seq_length" in encoder_config: + derived_max_model_len = encoder_config["max_seq_length"] + + # If the user specified a max length, make sure it is smaller than the + # derived length from the HF model config. + if max_model_len is None: + max_model_len = int(derived_max_model_len) + elif max_model_len > derived_max_model_len: + # Some models might have a separate key for specifying model_max_length + # that will be bigger than derived_max_model_len. We compare user input + # with model_max_length and allow this override when it's smaller. + model_max_length = getattr(hf_config, "model_max_length", None) + if model_max_length is not None and max_model_len <= model_max_length: + if disable_sliding_window: + # TODO(robertgshaw): Find a model that has model_max_length + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "model_max_length in the config. Please raise an issue " + "so we can investigate.") + else: + msg = ( + f"User-specified max_model_len ({max_model_len}) is greater " + f"than the derived max_model_len ({max_len_key}=" + f"{derived_max_model_len} or model_max_length=" + f"{model_max_length} in model's config.json). 
This may lead "
+                    "to incorrect model outputs or CUDA errors.")
+                if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
+                    logger.warning(
+                        "%s Make sure the value is correct and within the "
+                        "model context size.", msg)
+                else:
+                    raise ValueError(
+                        f"{msg} To allow overriding this maximum, set "
+                        "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
+    return int(max_model_len)
+
+
+def get_min_sliding_window(
+        sliding_window: Union[int, List[Optional[int]]]) -> int:
+    if isinstance(sliding_window, list):
+        return min(s for s in sliding_window if s is not None)
+
+    return sliding_window
+
+
+def get_served_model_name(model: str,
+                          served_model_name: Optional[Union[str, List[str]]]):
+    """
+    If the input is a non-empty list, the first model_name in
+    `served_model_name` is taken.
+    If the input is a non-empty string, it is used directly.
+    For cases where the input is either an empty string or an
+    empty list, the fallback is to use `model`.
+    """
+    if not served_model_name:
+        return model
+    if isinstance(served_model_name, list):
+        return served_model_name[0]
+    return served_model_name
+
+
+@dataclass
+class DecodingConfig:
+    """Dataclass which contains the decoding strategy of the engine."""
+
+    # Which guided decoding algo to use.
+    # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
+    guided_decoding_backend: str = 'xgrammar'
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: List[Any] = []
+        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
+        backend = self.guided_decoding_backend
+        if backend not in valid_guided_backends:
+            raise ValueError(f"Invalid guided_decoding_backend '{backend}', "
+                             f"must be one of {valid_guided_backends}")
+
+
+@dataclass
+class ObservabilityConfig:
+    """Configuration for observability."""
+    otlp_traces_endpoint: Optional[str] = None
+
+    # Collecting detailed timing information for each request can be expensive.
+
+    # If set, collects the model forward time for the request.
+    collect_model_forward_time: bool = False
+
+    # If set, collects the model execute time for the request.
+    collect_model_execute_time: bool = False
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: List[Any] = []
+        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        if not is_otel_available() and self.otlp_traces_endpoint is not None:
+            raise ValueError(
+                "OpenTelemetry is not available. Unable to configure "
+                "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
+                f"installed. Original error:\n{otel_import_error_traceback}")
+
+
+class KVTransferConfig(BaseModel):
+    """Configuration for distributed KV cache transfer."""
+
+    # The KV connector for vLLM to transmit KV caches between vLLM instances.
+    kv_connector: Optional[str] = None
+
+    # The device used by kv connector to buffer the KV cache.
+    # Currently, only 'cuda' is supported.
+    kv_buffer_device: Optional[str] = "cuda"
+
+    # The buffer size for TorchDistributedConnector. Measured in number of
+    # bytes. Recommended value: 1e9 (about 1GB).
+    kv_buffer_size: float = 1e9
+
+    # Whether this vLLM instance produces, consumes KV cache, or both. Choices
+    # are 'kv_producer', 'kv_consumer', and 'kv_both'.
+    kv_role: Optional[str] = None
+
+    # The rank of this vLLM instance in the KV cache transfer. Typical value:
+    # 0 for prefill instance, 1 for decode instance.
+    # Currently only 1P1D is supported.
+    kv_rank: Optional[int] = None
+
+    # The number of parallel instances for KV cache transfer. For
+    # PyNcclConnector, this should be 2.
+    kv_parallel_size: int = 1
+
+    # The KV connector IP, used to build the distributed connection.
+    kv_ip: str = "127.0.0.1"
+
+    # The KV connector port, used to build the distributed connection.
+    kv_port: int = 14579
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: List[Any] = []
+        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        return hash_str
+
+    @classmethod
+    def from_cli(cls, cli_value: str) -> "KVTransferConfig":
+        """Parse the CLI value for the kv cache transfer config."""
+        return KVTransferConfig.model_validate_json(cli_value)
+
+    def model_post_init(self, __context: Any) -> None:
+
+        if self.kv_role is not None and self.kv_role not in [
+                "kv_producer", "kv_consumer", "kv_both"
+        ]:
+            raise ValueError(
+                f"Unsupported kv_role: {self.kv_role}. "
+                f"Supported roles are `kv_producer`, `kv_consumer`, "
+                f"and `kv_both`")
+
+        if self.kv_connector is not None and self.kv_role is None:
+            raise ValueError("Please specify kv_role when kv_connector "
+                             "is set, supported roles are `kv_producer`, "
+                             "`kv_consumer`, and `kv_both`")
+
+    @property
+    def is_kv_transfer_instance(self) -> bool:
+        return self.kv_connector is not None and \
+            self.kv_role in ["kv_producer", "kv_consumer", "kv_both"]
+
+    @property
+    def need_kv_parallel_group(self) -> bool:
+        # For database-based connectors, vLLM does not need to create a
+        # parallel group, and in that case the kv parallel size will be 1.
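+        # Illustrative example (not part of the original source): a
+        # PyNcclConnector setup with kv_parallel_size=2 needs the parallel
+        # group, while a database-based connector with kv_parallel_size=1
+        # does not.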
+ return self.kv_connector is not None and self.kv_parallel_size > 1 + + @property + def is_kv_producer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_producer", "kv_both"] + + @property + def is_kv_consumer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_consumer", "kv_both"] + + +class CompilationLevel: + # constants for the levels of the compilation process + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + PIECEWISE = 3 + + +class CompilationConfig(BaseModel): + """ + Configuration for compilation. + It has three parts: + - Top-level Compilation control: + - level: the level of compilation. + - 0: no compilation. + - 1: dynamo as is. + - 2: dynamo once. + - 3: piecewise compilation. + - debug_dump_path: the path to dump the debug information. + - cache_dir: the directory to store the compiled graph, to + accelerate Inductor compilation. By default, it will use + model-related information to generate a cache directory. + - backend: the backend for compilation. It needs to be a string. + - "" (empty string): use the default backend. + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. + - "full.module.name": a qualified name which can be used to import the backend function. + We use string to avoid serialization issues when using compilation in a distributed setting. + When the compilation level is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). + When the compilation level is 3, the backend is used for the piecewise compilation (it sees a part of the graph). + - custom_ops: fine-grained control over which custom ops to enable/disable. + Use 'all' to enable all, 'none' to disable all. + Also specify a list of custom op names to enable (prefixed with a '+'), + or disable (prefixed with a '-'). + Examples: + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + By default, all custom ops are enabled when running without Inductor + and disabled when running with Inductor (compile_level >= Inductor). + - splitting_ops: a list of ops to split the full graph into subgraphs, used in piecewise compilation. + - CudaGraph capture: + - use_cudagraph: whether to use cudagraph inside compilation. + - False: cudagraph inside compilation is not used. + - True: cudagraph inside compilation is used. It requires + that all input buffers have fixed addresses, and all + splitting ops write their outputs to input buffers. + Note that this is orthogonal to the cudagraph capture logic + outside of compilation. + TODO: move outside cudagraph logic into compilation. + torch.compile will handle cudagraph capture logic in the future. + - cudagraph_capture_sizes: sizes to capture cudagraph. + - None (default): capture sizes are inferred from vllm config. + - List[int]: capture sizes are specified as given. + - cudagraph_num_of_warmups: number of warmup runs for cudagraph. + It means the first several runs will be treated as warmup runs. + Only after that, the execution will be recorded, and the recorded + cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. + - Inductor compilation: + - use_inductor: whether to use inductor compilation. 
+ - False: inductor compilation is not used. graph runs in eager. + - True: inductor compilation is used. one graph for symbolic shape + is compiled. In addition, compile for compile_sizes, + using configurations in inductor_compile_config. + - compile_sizes: sizes to compile for inductor. In addition + to integers, it also supports "cudagraph_capture_sizes" to + specify the sizes for cudagraph capture. + - inductor_compile_config: additional configurations for inductor. + - None: use default configurations. + - inductor_passes: additional passes for inductor. It is a dictionary + from pass name to pass function qualified name. We use function + name because the config uses json format. If we pass the config + from Python, functions can also be passed directly via Python object + constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + - custom inductor passes: see PassConfig for more details + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ # noqa + level: int = 0 + debug_dump_path: str = "" + cache_dir: str = "" + backend: str = "" + custom_ops: List[str] = Field(default_factory=list) + splitting_ops: List[str] = Field(default=None) # type: ignore + + use_inductor: bool = True + compile_sizes: Optional[List[Union[int, str]]] = Field(default=None) + inductor_compile_config: Dict = Field(default_factory=dict) + inductor_passes: Dict[str, str] = Field(default_factory=dict) + + use_cudagraph: bool = False + cudagraph_num_of_warmups: int = 0 + cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False + + class PassConfig(BaseModel): + """ + Configuration for custom Inductor passes. + This is separate from general CompilationConfig so that inductor passes + don't all have access to full configuration - that would create a cycle + as the PassManager is set as a property of config. + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graphs. Default is . + - enable_fusion: whether to enable the custom fusion pass. + - enable_reshape: whether to enable the custom reshape elimination pass. + TODO better pass enabling system. + """ + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True + enable_reshape: bool = True + + def uuid(self): + """ + Produces a hash unique to the pass configuration. + Any new fields that affect compilation should be added to the hash. + Do not include dump_graph_* in the hash - they don't affect + compilation. 
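+            For example (illustrative, not part of the original docstring),
+            two PassConfig objects differing only in dump_graph_dir or
+            dump_graph_stages share the same uuid, while toggling
+            enable_fusion or enable_reshape changes it.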
+ """ + dict_ = self.model_dump( + include={"enable_fusion", "enable_reshape"}) + encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") + return hashlib.sha256(encoded).digest() + + def model_post_init(self, __context: Any) -> None: + if not self.enable_reshape and self.enable_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled." + "RMSNorm + quant (fp8) fusion might not work") + + pass_config: PassConfig = Field(default_factory=PassConfig) + + # not configurable, computed after init + max_capture_size: int = PrivateAttr + local_cache_dir: str = PrivateAttr # local cache dir for each rank + # optimization: + # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. + # since we know all keys are in a range [0, max_capture_size], + # we can optimize it to List[int] for better lookup performance. + bs_to_padded_graph_size: List[int] = PrivateAttr + + # keep track of enabled and disabled custom ops + enabled_custom_ops: Counter[str] = PrivateAttr + disabled_custom_ops: Counter[str] = PrivateAttr + traced_files: Set[str] = PrivateAttr + compilation_time: float = PrivateAttr + + # Per-model forward context + # Map from layer name to the attention cls + static_forward_context: Dict[str, Any] = PrivateAttr + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.level) + factors.append(self.backend) + factors.append(self.custom_ops) + factors.append(self.splitting_ops) + factors.append(self.use_inductor) + factors.append(self.inductor_compile_config) + factors.append(self.inductor_passes) + factors.append(self.pass_config.uuid()) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __repr__(self) -> str: + exclude = { + "static_forward_context", + "enabled_custom_ops", + "disabled_custom_ops", + "compilation_time", + "bs_to_padded_graph_size", + "pass_config", + "traced_files", + } + return self.model_dump_json(exclude=exclude, exclude_unset=True) + + __str__ = __repr__ + + @classmethod + def from_cli(cls, cli_value: str) -> "CompilationConfig": + """Parse the CLI value for the compilation config.""" + if cli_value in ["0", "1", "2", "3"]: + return cls(level=int(cli_value)) + # do not use `eval`, it is dangerous and can execute arbitrary code + dict_value = ast.literal_eval(cli_value) + return CompilationConfig.model_validate(dict_value) + + def model_post_init(self, __context: Any) -> None: + + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + if self.splitting_ops is None: + if envs.VLLM_USE_V1: + # v1 must split the graph on attention ops + # for piecewise cudagraph + self.splitting_ops = [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + ] + else: + # v0 uses full graph compilation + self.splitting_ops = [] + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) + continue + + # 
resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) + + self.enabled_custom_ops = Counter() + self.disabled_custom_ops = Counter() + self.traced_files = set() + self.static_forward_context = {} + self.compilation_time = 0.0 + + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: + if self.level == CompilationLevel.NO_COMPILATION: + raise ValueError("No compilation level is set.") + + from torch._dynamo.backends.registry import list_backends + torch_backends = list_backends(exclude_tags=tuple()) + if self.level in [ + CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE + ]: + if self.backend == "": + return "eager" + if self.backend in torch_backends: + return self.backend + return resolve_obj_by_qualname(self.backend) + + # TODO: pass user-specified backend to piecewise compilation + # merge with the config use_inductor + assert self.level == CompilationLevel.PIECEWISE + + from vllm.compilation.backends import VllmBackend + return VllmBackend(vllm_config) + + def init_with_cudagraph_sizes(self, + cudagraph_capture_sizes: List[int]) -> None: + """To complete the initialization of config, + we need to know the cudagraph sizes.""" + + if self.cudagraph_capture_sizes is None: + self.cudagraph_capture_sizes = cudagraph_capture_sizes + else: + # de-duplicate the sizes provided by the config + self.cudagraph_capture_sizes = list( + set(self.cudagraph_capture_sizes)) + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + cudagraph_capture_sizes, self.cudagraph_capture_sizes) + + computed_compile_sizes = [] + if self.compile_sizes is not None: + # de-duplicate the sizes provided by the config + self.compile_sizes = list(set(self.compile_sizes)) + for x in self.compile_sizes: + if isinstance(x, str): + assert x == "cudagraph_capture_sizes", \ + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" + computed_compile_sizes.extend(self.cudagraph_capture_sizes) + else: + assert isinstance(x, int) + computed_compile_sizes.append(x) + self.compile_sizes = computed_compile_sizes # type: ignore + + # sort to make sure cudagraph capture sizes are in descending order + self.cudagraph_capture_sizes.sort(reverse=True) + self.max_capture_size = self.cudagraph_capture_sizes[ + 0] if self.cudagraph_capture_sizes else 0 + + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_capture_size + 1) + ] + for end, start in zip(self.cudagraph_capture_sizes, + self.cudagraph_capture_sizes[1:] + [0]): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end + self.bs_to_padded_graph_size[ + self.max_capture_size] = self.max_capture_size + + +@dataclass +class VllmConfig: + """Dataclass which contains all vllm-related configuration. This + simplifies passing around the distinct configurations in the codebase. 
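As a usage sketch (assuming the dataclass is importable from vllm.config and that default construction is acceptable, which get_current_vllm_config later in this file relies on), the aggregate object is simply read field by field:

    from vllm.config import VllmConfig

    cfg = VllmConfig()                        # unset sections stay None
    cfg.parallel_config.tensor_parallel_size  # sub-configs with factories exist
    cfg.model_config                          # None until a model is configured
    cfg.compilation_config.level              # created by __post_init__ if unset
    cfg.instance_id                           # short random id, also set post-init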
+ """ + + model_config: ModelConfig = field(default=None, init=True) # type: ignore + cache_config: CacheConfig = field(default=None, init=True) # type: ignore + parallel_config: ParallelConfig = field(default_factory=ParallelConfig, + init=True) + scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig, + init=True) + device_config: DeviceConfig = field(default=None, + init=True) # type: ignore + load_config: LoadConfig = field(default=None, init=True) # type: ignore + lora_config: Optional[LoRAConfig] = None + speculative_config: Optional[SpeculativeConfig] = None + decoding_config: Optional[DecodingConfig] = None + observability_config: Optional[ObservabilityConfig] = None + prompt_adapter_config: Optional[PromptAdapterConfig] = None + quant_config: Optional[QuantizationConfig] = None + compilation_config: CompilationConfig = field(default=None, + init=True) # type: ignore + kv_transfer_config: KVTransferConfig = field(default=None, + init=True) # type: ignore + # some opaque config, only used to provide additional information + # for the hash computation, mainly used for testing and debugging. + additional_config: SupportsHash = field(default=None, + init=True) # type: ignore + instance_id: str = "" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + # summarize system state + from torch._inductor.codecache import CacheBase + system_factors = CacheBase.get_system() + factors.append(system_factors) + + # summarize pytorch state + from torch._inductor.codecache import torch_key + torch_factors = torch_key() + factors.append(torch_factors) + + # summarize vllm config + vllm_factors: List[Any] = [] + from vllm import __version__ + vllm_factors.append(__version__) + if self.model_config: + vllm_factors.append(self.model_config.compute_hash()) + else: + vllm_factors.append("None") + if self.cache_config: + vllm_factors.append(self.cache_config.compute_hash()) + else: + vllm_factors.append("None") + if self.parallel_config: + vllm_factors.append(self.parallel_config.compute_hash()) + else: + vllm_factors.append("None") + if self.scheduler_config: + vllm_factors.append(self.scheduler_config.compute_hash()) + else: + vllm_factors.append("None") + if self.device_config: + vllm_factors.append(self.device_config.compute_hash()) + else: + vllm_factors.append("None") + if self.load_config: + vllm_factors.append(self.load_config.compute_hash()) + else: + vllm_factors.append("None") + if self.lora_config: + vllm_factors.append(self.lora_config.compute_hash()) + else: + vllm_factors.append("None") + if self.speculative_config: + vllm_factors.append(self.speculative_config.compute_hash()) + else: + vllm_factors.append("None") + if self.decoding_config: + vllm_factors.append(self.decoding_config.compute_hash()) + else: + vllm_factors.append("None") + if self.observability_config: + vllm_factors.append(self.observability_config.compute_hash()) + else: + vllm_factors.append("None") + if self.prompt_adapter_config: + vllm_factors.append(self.prompt_adapter_config.compute_hash()) + else: + vllm_factors.append("None") + if self.quant_config: + pass # should be captured by 
model_config.quantization + if self.compilation_config: + vllm_factors.append(self.compilation_config.compute_hash()) + else: + vllm_factors.append("None") + if self.kv_transfer_config: + vllm_factors.append(self.kv_transfer_config.compute_hash()) + else: + vllm_factors.append("None") + if self.additional_config: + vllm_factors.append(self.additional_config.compute_hash()) + else: + vllm_factors.append("None") + factors.append(vllm_factors) + + hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] + return hash_str + + def pad_for_cudagraph(self, batch_size: int) -> int: + # if batch_size > self.compilation_config.max_capture_size, + # it should raise an IndexError. + # the caller should make sure the batch_size is within the range, + # i.e., batch_size <= self.compilation_config.max_capture_size + return self.compilation_config.bs_to_padded_graph_size[batch_size] + + @staticmethod + def _get_quantization_config( + model_config: ModelConfig, + load_config: LoadConfig) -> Optional[QuantizationConfig]: + """Get the quantization config.""" + from vllm.platforms import current_platform + if model_config.quantization is not None: + from vllm.model_executor.model_loader.weight_utils import ( + get_quant_config) + quant_config = get_quant_config(model_config, load_config) + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + if capability < quant_config.get_min_capability(): + raise ValueError( + f"The quantization method {model_config.quantization} " + "is not supported for the current GPU. Minimum " + f"capability: {quant_config.get_min_capability()}. " + f"Current capability: {capability}.") + supported_dtypes = quant_config.get_supported_act_dtypes() + if model_config.dtype not in supported_dtypes: + raise ValueError( + f"{model_config.dtype} is not supported for quantization " + f"method {model_config.quantization}. Supported dtypes: " + f"{supported_dtypes}") + return quant_config + return None + + def with_hf_config( + self, + hf_config: PretrainedConfig, + architectures: Optional[list[str]] = None, + ) -> "VllmConfig": + if architectures is not None: + hf_config = copy.deepcopy(hf_config) + hf_config.architectures = architectures + + model_config = copy.deepcopy(self.model_config) + model_config.hf_config = hf_config + + return replace(self, model_config=model_config) + + def __post_init__(self): + """Verify configs are valid & consistent with each other. 
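One consequence of the wiring being verified here shows up at runtime: pad_for_cudagraph above is a bare table lookup, so callers must keep the batch size within max_capture_size, exactly as its comment warns. A small helper sketch of that guarded use (the helper name is illustrative, not part of vLLM):

    from vllm.config import VllmConfig

    def padded_batch_size(vllm_config: VllmConfig, num_seqs: int) -> int:
        """Return the cudagraph padding for a batch, or the raw size if none fits."""
        comp = vllm_config.compilation_config
        if num_seqs <= comp.max_capture_size:
            # in range: bs_to_padded_graph_size covers [0, max_capture_size]
            return vllm_config.pad_for_cudagraph(num_seqs)
        # too large for any captured graph: run without padding and without
        # a cudagraph
        return num_seqs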
+ """ + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) + + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) + + if self.lora_config: + self.lora_config.verify_with_cache_config(self.cache_config) + self.lora_config.verify_with_model_config(self.model_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) + if self.prompt_adapter_config: + self.prompt_adapter_config.verify_with_model_config( + self.model_config) + + if self.quant_config is None and \ + self.model_config is not None and self.load_config is not None: + self.quant_config = VllmConfig._get_quantization_config( + self.model_config, self.load_config) + + from vllm.platforms import current_platform + if self.scheduler_config is not None and \ + self.model_config is not None and \ + self.scheduler_config.chunked_prefill_enabled and \ + self.model_config.dtype == torch.float32 and \ + current_platform.get_device_capability() == (7, 5): + logger.warning_once( + "Turing devices tensor cores do not support float32 matmul. " + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels.") + + if self.compilation_config is None: + self.compilation_config = CompilationConfig() + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + self.compilation_config.custom_ops = ["none"] + self.compilation_config.use_cudagraph = True + self.compilation_config.use_inductor = True + self.compilation_config.cudagraph_num_of_warmups = 1 + self.compilation_config.pass_config.enable_fusion = False + self.compilation_config.pass_config.enable_reshape = False + self.compilation_config.level = CompilationLevel.PIECEWISE + + self._set_cudagraph_sizes() + + if self.cache_config is not None and \ + self.cache_config.cpu_offload_gb > 0 and \ + self.compilation_config.level != CompilationLevel.NO_COMPILATION: + logger.warning( + "CPU offload is not supported with `torch.compile` yet." + " Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + if self.lora_config is not None and self.compilation_config.level !=\ + CompilationLevel.NO_COMPILATION: + logger.warning("LoRA is not supported with `torch.compile` yet. 
" + "Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + current_platform.check_and_update_config(self) + + # If MLA is enabled, force disable chunked prefill and prefix caching + if self.model_config and self.model_config.use_mla: + logger.info("MLA is enabled; forcing chunked prefill and prefix " + "caching to be disabled.") + self.scheduler_config.enable_chunked_prefill = False + self.scheduler_config.chunked_prefill_enabled = False + + if self.cache_config is not None: + self.cache_config.enable_prefix_caching = False + + if not self.instance_id: + self.instance_id = random_uuid()[:5] + + def _set_cudagraph_sizes(self): + """ + cudagraph batchsize padding logic: + + `[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible + batch sizes that cudagraph will capture. + + Depending on the engine's configuration of `max_num_seqs`, the + candidate batch sizes to capture cudagraph will shrink to the subset + which just cover the range of `[1, max_num_seqs]`. In the common case, + `max_num_seqs` is 256, and the cudagraph batch sizes will be + `[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`. + + However, if users specify the cudagraph capture sizes through + compilation config, we will use the specified sizes instead. + + In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` + will be the final sizes to capture cudagraph (in descending order). + + During runtime, if batchsize is larger than + `vllm_config.compilation_config.cudagraph_capture_sizes`, + no cudagraph will be used. + If the batch size is no larger than + `vllm_config.compilation_config.cudagraph_capture_sizes`, + we can quickly find the padded graph size for a given batch size by + looking up `vllm_config.compilation_config.bs_to_padded_graph_size`. 
+ """ + + # calculate the default `batch_size_capture_list` + if not envs.VLLM_USE_V1: + batch_size_capture_list = [] + max_batchsize_to_capture = 0 + if self.scheduler_config is not None and \ + self.model_config is not None and \ + not self.model_config.enforce_eager: + + possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)] + # find the minimum size that is larger than max_num_seqs, + # which then becomes the max_batchsize_to_capture + larger_sizes = [ + x for x in possible_sizes + if x >= self.scheduler_config.max_num_seqs + ] + if larger_sizes: + max_batchsize_to_capture = larger_sizes[0] + else: + max_batchsize_to_capture = possible_sizes[-1] + + # filter out the sizes that are + # larger than max_batchsize_to_capture + batch_size_capture_list = [ + size for size in possible_sizes + if size <= max_batchsize_to_capture + ] + else: + batch_size_capture_list = [] + if self.model_config is not None and \ + not self.model_config.enforce_eager: + batch_size_capture_list = [1, 2, 4 + ] + [i for i in range(8, 513, 8)] + + self.compilation_config.init_with_cudagraph_sizes( + batch_size_capture_list) + + def __str__(self): + return ( + f"model={self.model_config.model!r}," + f" speculative_config={self.speculative_config!r}," + f" tokenizer={self.model_config.tokenizer!r}, " + f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}," + f" tokenizer_mode={self.model_config.tokenizer_mode}, " + f"revision={self.model_config.revision}, " + f"override_neuron_config={self.model_config.override_neuron_config}," + f" tokenizer_revision={self.model_config.tokenizer_revision}, " + f"trust_remote_code={self.model_config.trust_remote_code}, " + f"dtype={self.model_config.dtype}, " + f"max_seq_len={self.model_config.max_model_len}," + f" download_dir={self.load_config.download_dir!r}, " + f"load_format={self.load_config.load_format}, " + f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}," + f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa + f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa + f"quantization={self.model_config.quantization}, " + f"enforce_eager={self.model_config.enforce_eager}, " + f"kv_cache_dtype={self.cache_config.cache_dtype}, " + f" device_config={self.device_config.device}, " + f"decoding_config={self.decoding_config!r}, " + f"observability_config={self.observability_config!r}, " + f"seed={self.model_config.seed}, " + f"served_model_name={self.model_config.served_model_name}, " + f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, " + f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, " # noqa + f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " + f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa + f"use_async_output_proc={self.model_config.use_async_output_proc}, " + f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, " # noqa + f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, " + f"pooler_config={self.model_config.pooler_config!r}, " + f"compilation_config={self.compilation_config!r}") + + +_current_vllm_config: Optional[VllmConfig] = None + + +@contextmanager +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): + """ + Temporarily set the current VLLM config. + Used during model initialization. + We save the current VLLM config in a global variable, + so that all modules can access it, e.g. 
custom ops + can access the VLLM config to determine how to dispatch. + """ + global _current_vllm_config + old_vllm_config = _current_vllm_config + from vllm.compilation.counter import compilation_counter + num_models_seen = compilation_counter.num_models_seen + try: + _current_vllm_config = vllm_config + yield + finally: + logger.debug("enabled custom ops: %s", + vllm_config.compilation_config.enabled_custom_ops) + logger.debug("disabled custom ops: %s", + vllm_config.compilation_config.disabled_custom_ops) + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + and compilation_counter.num_models_seen == num_models_seen: + # If the model supports compilation, + # compilation_counter.num_models_seen should be increased + # by at least 1. + # If it is not increased, it means the model does not support + # compilation (does not have @support_torch_compile decorator). + logger.warning( + "`torch.compile` is turned on, but the model %s" + " does not support it. Please open an issue on GitHub" + "if you want it to be supported.", + vllm_config.model_config.model) + _current_vllm_config = old_vllm_config + + +def get_current_vllm_config() -> VllmConfig: + if _current_vllm_config is None: + # in ci, usually when we test custom ops/modules directly, + # we don't set the vllm config. In that case, we set a default + # config. + logger.warning("Current VLLM config is not set.") + from vllm.config import VllmConfig + return VllmConfig() + return _current_vllm_config diff --git a/.venv/lib/python3.11/site-packages/vllm/connections.py b/.venv/lib/python3.11/site-packages/vllm/connections.py new file mode 100644 index 0000000000000000000000000000000000000000..dc060bb6f88a75f9b8d300c364d21594cf23d1f6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/connections.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import Mapping, MutableMapping, Optional +from urllib.parse import urlparse + +import aiohttp +import requests + +from vllm.version import __version__ as VLLM_VERSION + + +class HTTPConnection: + """Helper class to send HTTP requests.""" + + def __init__(self, *, reuse_client: bool = True) -> None: + super().__init__() + + self.reuse_client = reuse_client + + self._sync_client: Optional[requests.Session] = None + self._async_client: Optional[aiohttp.ClientSession] = None + + def get_sync_client(self) -> requests.Session: + if self._sync_client is None or not self.reuse_client: + self._sync_client = requests.Session() + + return self._sync_client + + # NOTE: We intentionally use an async function even though it is not + # required, so that the client is only accessible inside async event loop + async def get_async_client(self) -> aiohttp.ClientSession: + if self._async_client is None or not self.reuse_client: + self._async_client = aiohttp.ClientSession(trust_env=True) + + return self._async_client + + def _validate_http_url(self, url: str): + parsed_url = urlparse(url) + + if parsed_url.scheme not in ("http", "https"): + raise ValueError("Invalid HTTP URL: A valid HTTP URL " + "must have scheme 'http' or 'https'.") + + def _headers(self, **extras: str) -> MutableMapping[str, str]: + return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras} + + def get_response( + self, + url: str, + *, + stream: bool = False, + timeout: Optional[float] = None, + extra_headers: Optional[Mapping[str, str]] = None, + ): + self._validate_http_url(url) + + client = self.get_sync_client() + extra_headers = extra_headers 
or {} + + return client.get(url, + headers=self._headers(**extra_headers), + stream=stream, + timeout=timeout) + + async def get_async_response( + self, + url: str, + *, + timeout: Optional[float] = None, + extra_headers: Optional[Mapping[str, str]] = None, + ): + self._validate_http_url(url) + + client = await self.get_async_client() + extra_headers = extra_headers or {} + + return client.get(url, + headers=self._headers(**extra_headers), + timeout=timeout) + + def get_bytes(self, url: str, *, timeout: Optional[float] = None) -> bytes: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + return r.content + + async def async_get_bytes( + self, + url: str, + *, + timeout: Optional[float] = None, + ) -> bytes: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + return await r.read() + + def get_text(self, url: str, *, timeout: Optional[float] = None) -> str: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + return r.text + + async def async_get_text( + self, + url: str, + *, + timeout: Optional[float] = None, + ) -> str: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + return await r.text() + + def get_json(self, url: str, *, timeout: Optional[float] = None) -> str: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + return r.json() + + async def async_get_json( + self, + url: str, + *, + timeout: Optional[float] = None, + ) -> str: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + return await r.json() + + def download_file( + self, + url: str, + save_path: Path, + *, + timeout: Optional[float] = None, + chunk_size: int = 128, + ) -> Path: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + with save_path.open("wb") as f: + for chunk in r.iter_content(chunk_size): + f.write(chunk) + + return save_path + + async def async_download_file( + self, + url: str, + save_path: Path, + *, + timeout: Optional[float] = None, + chunk_size: int = 128, + ) -> Path: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + with save_path.open("wb") as f: + async for chunk in r.content.iter_chunked(chunk_size): + f.write(chunk) + + return save_path + + +global_http_connection = HTTPConnection() +"""The global :class:`HTTPConnection` instance used by vLLM.""" diff --git a/.venv/lib/python3.11/site-packages/vllm/cumem_allocator.abi3.so b/.venv/lib/python3.11/site-packages/vllm/cumem_allocator.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..d9867990328ea0e68141c7b1d8ba0a438c99859e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/cumem_allocator.abi3.so differ diff --git a/.venv/lib/python3.11/site-packages/vllm/envs.py b/.venv/lib/python3.11/site-packages/vllm/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..745b068b7a458d917437bacea18eb6e8fdcfad28 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/envs.py @@ -0,0 +1,588 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional + +if TYPE_CHECKING: + VLLM_HOST_IP: str = "" + VLLM_PORT: Optional[int] = None + VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() + VLLM_USE_MODELSCOPE: bool = False + VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 + VLLM_NCCL_SO_PATH: Optional[str] = None + LD_LIBRARY_PATH: 
Optional[str] = None + VLLM_USE_TRITON_FLASH_ATTN: bool = False + VLLM_FLASH_ATTN_VERSION: Optional[int] = None + LOCAL_RANK: int = 0 + CUDA_VISIBLE_DEVICES: Optional[str] = None + VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 + VLLM_API_KEY: Optional[str] = None + S3_ACCESS_KEY_ID: Optional[str] = None + S3_SECRET_ACCESS_KEY: Optional[str] = None + S3_ENDPOINT_URL: Optional[str] = None + VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm") + VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") + VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" + VLLM_NO_USAGE_STATS: bool = False + VLLM_DO_NOT_TRACK: bool = False + VLLM_USAGE_SOURCE: str = "" + VLLM_CONFIGURE_LOGGING: int = 1 + VLLM_LOGGING_LEVEL: str = "INFO" + VLLM_LOGGING_PREFIX: str = "" + VLLM_LOGGING_CONFIG_PATH: Optional[str] = None + VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None + VLLM_TRACE_FUNCTION: int = 0 + VLLM_ATTENTION_BACKEND: Optional[str] = None + VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None + VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False + VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False + VLLM_PP_LAYER_PARTITION: Optional[str] = None + VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" + VLLM_OPENVINO_DEVICE: str = "CPU" + VLLM_OPENVINO_KVCACHE_SPACE: int = 0 + VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None + VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False + VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") + VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 + VLLM_USE_RAY_SPMD_WORKER: bool = False + VLLM_USE_RAY_COMPILED_DAG: bool = False + VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_WORKER_MULTIPROC_METHOD: str = "fork" + VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") + VLLM_IMAGE_FETCH_TIMEOUT: int = 5 + VLLM_VIDEO_FETCH_TIMEOUT: int = 30 + VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_TARGET_DEVICE: str = "cuda" + MAX_JOBS: Optional[str] = None + NVCC_THREADS: Optional[str] = None + VLLM_USE_PRECOMPILED: bool = False + VLLM_NO_DEPRECATION_WARNING: bool = False + VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False + CMAKE_BUILD_TYPE: Optional[str] = None + VERBOSE: bool = False + VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False + VLLM_TEST_FORCE_FP8_MARLIN: bool = False + VLLM_RPC_TIMEOUT: int = 10000 # ms + VLLM_PLUGINS: Optional[List[str]] = None + VLLM_TORCH_PROFILER_DIR: Optional[str] = None + VLLM_USE_TRITON_AWQ: bool = False + VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False + VLLM_SKIP_P2P_CHECK: bool = False + VLLM_DISABLED_KERNELS: List[str] = [] + VLLM_USE_V1: bool = False + VLLM_ENABLE_V1_MULTIPROCESSING: bool = True + VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 + VLLM_DISABLE_COMPILE_CACHE: bool = False + K_SCALE_CONSTANT: int = 200 + V_SCALE_CONSTANT: int = 100 + VLLM_SERVER_DEV_MODE: bool = False + VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 + VLLM_MLA_DISABLE: bool = False + VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True + VLLM_MLA_DISABLE_REQUANTIZATION: bool = False + VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True + VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False + VLLM_RAY_PER_WORKER_GPUS: float = 1.0 + VLLM_RAY_BUNDLE_INDICES: str = "" + + +def get_default_cache_root(): + return os.getenv( + "XDG_CACHE_HOME", + os.path.join(os.path.expanduser("~"), ".cache"), + ) + + +def get_default_config_root(): + return os.getenv( + "XDG_CONFIG_HOME", + os.path.join(os.path.expanduser("~"), ".config"), + ) + + +def maybe_convert_int(value: Optional[str]) -> 
Optional[int]: + if value is None: + return None + return int(value) + + +# The begin-* and end* here are used by the documentation generator +# to extract the used env vars. + +# begin-env-vars-definition + +environment_variables: Dict[str, Callable[[], Any]] = { + + # ================== Installation Time Env Vars ================== + + # Target device of vLLM, supporting [cuda (by default), + # rocm, neuron, cpu, openvino] + "VLLM_TARGET_DEVICE": + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), + + # Maximum number of compilation jobs to run in parallel. + # By default this is the number of CPUs + "MAX_JOBS": + lambda: os.getenv("MAX_JOBS", None), + + # Number of threads to use for nvcc + # By default this is 1. + # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. + "NVCC_THREADS": + lambda: os.getenv("NVCC_THREADS", None), + + # If set, vllm will use precompiled binaries (*.so) + "VLLM_USE_PRECOMPILED": + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # CMake build type + # If not set, defaults to "Debug" or "RelWithDebInfo" + # Available options: "Debug", "Release", "RelWithDebInfo" + "CMAKE_BUILD_TYPE": + lambda: os.getenv("CMAKE_BUILD_TYPE"), + + # If set, vllm will print verbose logs during installation + "VERBOSE": + lambda: bool(int(os.getenv('VERBOSE', '0'))), + + # Root directory for VLLM configuration files + # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set + # Note that this not only affects how vllm finds its configuration files + # during runtime, but also affects how vllm installs its configuration + # files during **installation**. + "VLLM_CONFIG_ROOT": + lambda: os.path.expanduser( + os.getenv( + "VLLM_CONFIG_ROOT", + os.path.join(get_default_config_root(), "vllm"), + )), + + # ================== Runtime Env Vars ================== + + # Root directory for VLLM cache files + # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set + "VLLM_CACHE_ROOT": + lambda: os.path.expanduser( + os.getenv( + "VLLM_CACHE_ROOT", + os.path.join(get_default_cache_root(), "vllm"), + )), + + # used in distributed environment to determine the ip address + # of the current node, when the node has multiple network interfaces. + # If you are using multi-node inference, you should set this differently + # on each node. + 'VLLM_HOST_IP': + lambda: os.getenv('VLLM_HOST_IP', ""), + + # used in distributed environment to manually set the communication port + # Note: if VLLM_PORT is set, and some code asks for multiple ports, the + # VLLM_PORT will be used as the first port, and the rest will be generated + # by incrementing the VLLM_PORT value. + # '0' is used to make mypy happy + 'VLLM_PORT': + lambda: int(os.getenv('VLLM_PORT', '0')) + if 'VLLM_PORT' in os.environ else None, + + # path used for ipc when the frontend api server is running in + # multi-processing mode to communicate with the backend engine process. + 'VLLM_RPC_BASE_PATH': + lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()), + + # If true, will load models from ModelScope instead of Hugging Face Hub. 
+ # note that the value is true or false, not numbers + "VLLM_USE_MODELSCOPE": + lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true", + + # Interval in seconds to log a warning message when the ring buffer is full + "VLLM_RINGBUFFER_WARNING_INTERVAL": + lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), + + # path to cudatoolkit home directory, under which should be bin, include, + # and lib directories. + "CUDA_HOME": + lambda: os.environ.get("CUDA_HOME", None), + + # Path to the NCCL library file. It is needed because nccl>=2.19 brought + # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234 + "VLLM_NCCL_SO_PATH": + lambda: os.environ.get("VLLM_NCCL_SO_PATH", None), + + # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl + # library file in the locations specified by `LD_LIBRARY_PATH` + "LD_LIBRARY_PATH": + lambda: os.environ.get("LD_LIBRARY_PATH", None), + + # flag to control if vllm should use triton flash attention + "VLLM_USE_TRITON_FLASH_ATTN": + lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in + ("true", "1")), + + # Force vllm to use a specific flash-attention version (2 or 3), only valid + # when using the flash-attention backend. + "VLLM_FLASH_ATTN_VERSION": + lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), + + # Internal flag to enable Dynamo fullgraph capture + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": + lambda: bool( + os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), + + # local rank of the process in the distributed setting, used to determine + # the GPU device id + "LOCAL_RANK": + lambda: int(os.environ.get("LOCAL_RANK", "0")), + + # used to control the visible devices in the distributed setting + "CUDA_VISIBLE_DEVICES": + lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None), + + # timeout for each iteration in the engine + "VLLM_ENGINE_ITERATION_TIMEOUT_S": + lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")), + + # API key for VLLM API server + "VLLM_API_KEY": + lambda: os.environ.get("VLLM_API_KEY", None), + + # S3 access information, used for tensorizer to load model from S3 + "S3_ACCESS_KEY_ID": + lambda: os.environ.get("S3_ACCESS_KEY_ID", None), + "S3_SECRET_ACCESS_KEY": + lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None), + "S3_ENDPOINT_URL": + lambda: os.environ.get("S3_ENDPOINT_URL", None), + + # Usage stats collection + "VLLM_USAGE_STATS_SERVER": + lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), + "VLLM_NO_USAGE_STATS": + lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DO_NOT_TRACK": + lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( + "DO_NOT_TRACK", None) or "0") == "1", + "VLLM_USAGE_SOURCE": + lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"), + + # Logging configuration + # If set to 0, vllm will not configure logging + # If set to 1, vllm will configure logging using the default configuration + # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH + "VLLM_CONFIGURE_LOGGING": + lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")), + "VLLM_LOGGING_CONFIG_PATH": + lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"), + + # this is used for configuring the default logging level + "VLLM_LOGGING_LEVEL": + lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"), + + # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages + "VLLM_LOGGING_PREFIX": + lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + + # if set, vllm will 
call logits processors in a thread pool with this many + # threads. This is useful when using custom logits processors that either + # (a) launch additional CUDA kernels or (b) do significant CPU-bound work + # while not holding the python GIL, or both. + "VLLM_LOGITS_PROCESSOR_THREADS": + lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) + if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, + + # Trace function calls + # If set to 1, vllm will trace function calls + # Useful for debugging + "VLLM_TRACE_FUNCTION": + lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), + + # Backend for attention computation + # Available options: + # - "TORCH_SDPA": use torch.nn.MultiheadAttention + # - "FLASH_ATTN": use FlashAttention + # - "XFORMERS": use XFormers + # - "ROCM_FLASH": use ROCmFlashAttention + # - "FLASHINFER": use flashinfer + "VLLM_ATTENTION_BACKEND": + lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), + + # If set, vllm will use flashinfer sampler + "VLLM_USE_FLASHINFER_SAMPLER": + lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])) + if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None, + + # If set, vllm will force flashinfer to use tensor cores; + # otherwise will use heuristic based on model architecture. + "VLLM_FLASHINFER_FORCE_TENSOR_CORES": + lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))), + + # Pipeline stage partition strategy + "VLLM_PP_LAYER_PARTITION": + lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), + + # (CPU backend only) CPU key-value cache space. + # default is 4GB + "VLLM_CPU_KVCACHE_SPACE": + lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. + "VLLM_CPU_OMP_THREADS_BIND": + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + + # OpenVINO device selection + # default is CPU + "VLLM_OPENVINO_DEVICE": + lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(), + + # OpenVINO key-value cache space + # default is 4GB + "VLLM_OPENVINO_KVCACHE_SPACE": + lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")), + + # OpenVINO KV cache precision + # default is bf16 if natively supported by platform, otherwise f16 + # To enable KV cache compression, please, explicitly specify u8 + "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION": + lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None), + + # Enables weights compression during model export via HF Optimum + # default is False + "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS": + lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)), + + # If the env var is set, then all workers will execute as separate + # processes from the engine, and we use the same mechanism to trigger + # execution on all workers. + # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it. + "VLLM_USE_RAY_SPMD_WORKER": + lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))), + + # If the env var is set, it uses the Ray's compiled DAG API + # which optimizes the control plane overhead. + # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. + "VLLM_USE_RAY_COMPILED_DAG": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))), + + # If the env var is set, it uses NCCL for communication in + # Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. 
+ "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) + ), + + # If the env var is set, it enables GPU communication overlap + # (experimental feature) in Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) + ), + + # Use dedicated multiprocess context for workers. + # Both spawn and fork work + "VLLM_WORKER_MULTIPROC_METHOD": + lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), + + # Path to the cache for storing downloaded assets + "VLLM_ASSETS_CACHE": + lambda: os.path.expanduser( + os.getenv( + "VLLM_ASSETS_CACHE", + os.path.join(get_default_cache_root(), "vllm", "assets"), + )), + + # Timeout for fetching images when serving multimodal models + # Default is 5 seconds + "VLLM_IMAGE_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), + + # Timeout for fetching videos when serving multimodal models + # Default is 15 seconds + "VLLM_VIDEO_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")), + + # Timeout for fetching audio when serving multimodal models + # Default is 10 seconds + "VLLM_AUDIO_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + + # Path to the XLA persistent cache directory. + # Only used for XLA devices such as TPUs. + "VLLM_XLA_CACHE_PATH": + lambda: os.path.expanduser( + os.getenv( + "VLLM_XLA_CACHE_PATH", + os.path.join(get_default_cache_root(), "vllm", "xla_cache"), + )), + "VLLM_FUSED_MOE_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), + + # If set, vllm will skip the deprecation warnings. + "VLLM_NO_DEPRECATION_WARNING": + lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))), + + # If set, the OpenAI API server will stay alive even after the underlying + # AsyncLLMEngine errors and stops serving requests + "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": + lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)), + + # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows + # the user to specify a max sequence length greater than + # the max length derived from the model's config.json. + # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": + lambda: + (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in + ("1", "true")), + + # If set, forces FP8 Marlin to be used for FP8 quantization regardless + # of the hardware support for FP8 compute. + "VLLM_TEST_FORCE_FP8_MARLIN": + lambda: + (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in + ("1", "true")), + "VLLM_TEST_FORCE_LOAD_FORMAT": + lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"), + + # Time in ms for the zmq client to wait for a response from the backend + # server for simple data operations + "VLLM_RPC_TIMEOUT": + lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), + + # a list of plugin names to load, separated by commas. + # if this is not set, it means all plugins will be loaded + # if this is set to an empty string, no plugins will be loaded + "VLLM_PLUGINS": + lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[ + "VLLM_PLUGINS"].split(","), + + # Enables torch profiler if set. Path to the directory where torch profiler + # traces are saved. Note that it must be an absolute path. 
+ "VLLM_TORCH_PROFILER_DIR": + lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os + .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), + + # If set, vLLM will use Triton implementations of AWQ. + "VLLM_USE_TRITON_AWQ": + lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), + + # If set, allow loading or unloading lora adapters in runtime, + "VLLM_ALLOW_RUNTIME_LORA_UPDATING": + lambda: + (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in + ("1", "true")), + + # By default, vLLM will check the peer-to-peer capability itself, + # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa + # If this env var is set to 1, vLLM will skip the peer-to-peer check, + # and trust the driver's peer-to-peer capability report. + "VLLM_SKIP_P2P_CHECK": + lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1", + + # List of quantization kernels that should be disabled, used for testing + # and performance comparisons. Currently only affects MPLinearKernel + # selection + # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel) + "VLLM_DISABLED_KERNELS": + lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ + "VLLM_DISABLED_KERNELS"].split(","), + + # If set, use the V1 code path. + "VLLM_USE_V1": + lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + + # Divisor for dynamic key scale factor calculation for FP8 KV Cache + "K_SCALE_CONSTANT": + lambda: int(os.getenv("K_SCALE_CONSTANT", "200")), + + # Divisor for dynamic value scale factor calculation for FP8 KV Cache + "V_SCALE_CONSTANT": + lambda: int(os.getenv("V_SCALE_CONSTANT", "100")), + # If set, enable multiprocessing in LLM for the V1 code path. + "VLLM_ENABLE_V1_MULTIPROCESSING": + lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), + "VLLM_LOG_BATCHSIZE_INTERVAL": + lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), + "VLLM_DISABLE_COMPILE_CACHE": + lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), + + # If set, vllm will run in development mode, which will enable + # some additional endpoints for developing and debugging, + # e.g. `/reset_prefix_cache` + "VLLM_SERVER_DEV_MODE": + lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), + + # Controls the maximum number of requests to handle in a + # single asyncio task when processing per-token outputs in the + # V1 AsyncLLM interface. It is applicable when handling a high + # concurrency of streaming requests. + # Setting this too high can result in a higher variance of + # inter-message latencies. Setting it too low can negatively impact + # TTFT and overall throughput. + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), + + # If set, vLLM will disable the MLA attention optimizations. + "VLLM_MLA_DISABLE": + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), + + # Flag that can control whether or not we perform matrix-absorption for MLA + # decode, i.e. 
absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the + # matrices reduces the runtime FLOPs needed to compute MLA but requires + # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage, + # the is enabled by default + "VLLM_MLA_PERFORM_MATRIX_ABSORPTION": + lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))), + + # When running MLA with matrix-absorption enabled and fp8 quantized weights + # we perform the matrix-absorption in float32 precision, after the matrices + # are absorbed we requantize the weights back to fp8, this flag can be used + # to disable the requantization step, and instead convert the absorbed + # matrices to match the activation type. This can lead to higher memory and + # compute usage but better preserves the accuracy of the original model. + "VLLM_MLA_DISABLE_REQUANTIZATION": + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))), + + # If set, vLLM will use the Triton implementation of moe_align_block_size, + # i.e. moe_align_block_size_triton in fused_moe.py. + "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": + lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) + ), + + # Number of GPUs per worker in Ray, if it is set to be a fraction, + # it allows ray to schedule multiple actors on a single GPU, + # so that users can colocate other actors on the same GPUs as vLLM. + "VLLM_RAY_PER_WORKER_GPUS": + lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), + + # Bundle indices for Ray, if it is set, it can control precisely + # which indices are used for the Ray bundle, for every worker. + # Format: comma-separated list of integers, e.g. "0,1,2,3" + "VLLM_RAY_BUNDLE_INDICES": + lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), + + # When on a Nvidia GPU aligns single entries (within a page) so they are 256 + # byte aligned for better performance, this increases the memory usage of + # the cache. Currently this only affects MLA that results in non-256 + # byte aligned entries. This matches the alignment the CUDA runtime uses + # for all allocations. Currently this primarily affects MLA, for most other + # models the alignment is already naturally aligned to 256 bytes. 
+ "VLLM_CUDA_MEM_ALIGN_KV_CACHE": + lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), +} + +# end-env-vars-definition + + +def __getattr__(name: str): + # lazy evaluation of environment variables + if name in environment_variables: + return environment_variables[name]() + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + return list(environment_variables.keys()) diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/__init__.py b/.venv/lib/python3.11/site-packages/vllm/executor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/__pycache__/ray_distributed_executor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/executor/__pycache__/ray_distributed_executor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04c971c6cd9b3e11c71bb524880736e01a51b00f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/executor/__pycache__/ray_distributed_executor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/executor_base.py b/.venv/lib/python3.11/site-packages/vllm/executor/executor_base.py new file mode 100644 index 0000000000000000000000000000000000000000..fb76276bb4b3476f70d4eb6b619bd40ead168dad --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/executor_base.py @@ -0,0 +1,376 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +from abc import ABC, abstractmethod +from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, + Union) + +import torch.nn as nn +from typing_extensions import TypeVar + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import ExecuteModelRequest, PoolerOutput +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase + +logger = init_logger(__name__) + +_R = TypeVar("_R", default=Any) + + +class ExecutorBase(ABC): + """Base class for all executors. + + An executor is responsible for executing the model on one device, + or it can be a distributed executor + that can execute the model on multiple devices. + """ + + uses_ray: bool # whether the executor uses Ray for orchestration. + + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + self._init_executor() + self.is_sleeping = False + + @abstractmethod + def _init_executor(self) -> None: + raise NotImplementedError + + @abstractmethod + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. 
+ + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + raise NotImplementedError + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. + + Normally, this should simply delegate to the underlying Worker. Some + ExecutorBase may require modification of the result, e.g. to ensure the + selected cache sizes are compatible with all workers. + + Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. + """ + results = self.collective_rpc("determine_num_available_blocks") + a = min([r[0] for r in results]) + b = min([r[1] for r in results]) + return a, b + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: This is logged in the executor because there can be >1 workers. + logger.info("# %s blocks: %d, # CPU blocks: %d", + current_platform.dispatch_key, num_gpu_blocks, + num_cpu_blocks) + max_concurrency = (num_gpu_blocks * self.cache_config.block_size / + self.model_config.max_model_len) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + self.model_config.max_model_len, max_concurrency) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self.collective_rpc("initialize_cache", + args=(num_gpu_blocks, num_cpu_blocks)) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + + def execute_model( + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: + output = self.collective_rpc("execute_model", + args=(execute_model_req, )) + return output[0] + + def stop_remote_worker_execution_loop(self) -> None: + """Releases parallel workers from model loop.""" + return + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return all(self.collective_rpc("add_lora", args=(lora_request, ))) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return all(self.collective_rpc("remove_lora", args=(lora_id, ))) + + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return all(self.collective_rpc("pin_lora", args=(lora_id, ))) + + def list_loras(self) -> Set[int]: + sets = self.collective_rpc("list_loras") + for s in sets: + assert s == sets[0], "All workers should have the same LORAs." + return sets[0] + + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + assert prompt_adapter_request.prompt_adapter_id > 0, \ + "prompt_adapter_id must be greater than 0." + return all( + self.collective_rpc("add_prompt_adapter", + args=(prompt_adapter_request, ))) + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + assert prompt_adapter_id > 0, \ + "prompt_adapter_id must be greater than 0." + return all( + self.collective_rpc("remove_prompt_adapter", + args=(prompt_adapter_id, ))) + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + assert prompt_adapter_id > 0, \ + "prompt_adapter_id must be greater than 0." + return all( + self.collective_rpc("pin_prompt_adapter", + args=(prompt_adapter_id, ))) + + def list_prompt_adapters(self) -> Set[int]: + sets = self.collective_rpc("list_prompt_adapters") + for s in sets: + assert (s == sets[0] + ), "All workers should have the same prompt adapters." + return sets[0] + + def start_profile(self) -> None: + self.collective_rpc("start_profile") + + def stop_profile(self) -> None: + self.collective_rpc("stop_profile") + + def sleep(self, level: int = 1): + if self.is_sleeping: + logger.warning("Executor is already sleeping.") + return + self.collective_rpc("sleep", kwargs=dict(level=level)) + self.is_sleeping = True + + def wake_up(self): + if not self.is_sleeping: + logger.warning("Executor is not sleeping.") + return + self.collective_rpc("wake_up") + self.is_sleeping = False + + def save_sharded_state( + self, + path: str, + pattern: Optional[str] = None, + max_size: Optional[int] = None, + ) -> None: + self.collective_rpc("save_sharded_state", + kwargs=dict(path=path, + pattern=pattern, + max_size=max_size)) + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError + + def shutdown(self) -> None: + """Shutdown the executor.""" + return + + def __del__(self): + self.shutdown() + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + """Executes one model step on the given sequences.""" + output = await make_async(self.execute_model)(execute_model_req) + return output + + async def stop_remote_worker_execution_loop_async(self) -> None: + """Releases parallel workers from model loop.""" + return + + async def check_health_async(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + self.check_health() + + +class DistributedExecutorBase(ExecutorBase): + """Abstract superclass of distributed executor implementations.""" + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. 
+ self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + + super().__init__(*args, **kwargs) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + # TODO: unify into collective_rpc + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_tensor_parallel_workers_only=True) + + # Only the driver worker returns the sampling results. + driver_outputs = self._driver_execute_model(execute_model_req) + assert driver_outputs is not None + return driver_outputs + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model(execute_model_req=None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + @abstractmethod + def _driver_execute_model( + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution loop + running in each of the remote workers. In this case, this method + returns None. Otherwise, this method returns the model output. + """ + raise NotImplementedError + + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: + return self._run_workers(method, *args, **(kwargs or {})) + + @abstractmethod + def _run_workers( + self, + method: Union[str, Callable], + *args, + async_run_tensor_parallel_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. + + Args: + async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. + + # TODO: simplify and merge with collective_rpc + """ + raise NotImplementedError + + @abstractmethod + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + raise NotImplementedError + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + @abstractmethod + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> List[SamplerOutput]: + """Execute the model asynchronously in the driver worker. 
+ + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + raise NotImplementedError + + @abstractmethod + async def _start_worker_execution_loop(self): + """Run execution loop on all workers. It guarantees all workers run + the loop or None of them is running the loop. Loop can be stopped by + `stop_remote_worker_execution_loop`. + The API is idempotent (guarantee only 1 loop run at any moment).""" + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/mp_distributed_executor.py b/.venv/lib/python3.11/site-packages/vllm/executor/mp_distributed_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..d1f8c36fbbec7638f89c0e828cbc42e3a66bee02 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/mp_distributed_executor.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import os +from typing import Any, Callable, List, Optional, Union + +import cloudpickle + +from vllm.executor.executor_base import DistributedExecutorBase +from vllm.executor.multiproc_worker_utils import ( + ProcessWorkerWrapper, ResultHandler, WorkerMonitor, + set_multiprocessing_worker_envs) +from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest +from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, + make_async, run_method, update_environment_variables) +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class MultiprocessingDistributedExecutor(DistributedExecutorBase): + """Python multiprocessing-based distributed executor""" + + uses_ray: bool = False + + def _check_cuda(self) -> None: + """Check that the number of GPUs is sufficient for the parallel + configuration. Separate from _init_executor to reduce the number of + indented blocks. + """ + parallel_config = self.parallel_config + world_size = parallel_config.world_size + tensor_parallel_size = parallel_config.tensor_parallel_size + + cuda_device_count = cuda_device_count_stateless() + # Use confusing message for more common TP-only case. + if tensor_parallel_size > cuda_device_count: + raise RuntimeError( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + f"to less than max local gpu count ({cuda_device_count})") + + if world_size > cuda_device_count: + raise RuntimeError( + f"please ensure that world_size ({world_size}) " + f"is less than than max local gpu count ({cuda_device_count})") + + # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers + if "CUDA_VISIBLE_DEVICES" not in os.environ: + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) + + def _init_executor(self) -> None: + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + self._check_cuda() + + # Create the parallel GPU workers. + world_size = self.parallel_config.world_size + tensor_parallel_size = self.parallel_config.tensor_parallel_size + + # Set multiprocessing envs that are common to V0 and V1 + set_multiprocessing_worker_envs(self.parallel_config) + + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
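+        # The resulting init method is a torch.distributed TCP rendezvous
+        # string, roughly of the form "tcp://127.0.0.1:<some free port>",
+        # which every local worker process uses to join the same process
+        # group.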
+ distributed_init_method = get_distributed_init_method( + "127.0.0.1", get_open_port()) + + self.workers: List[ProcessWorkerWrapper] = [] + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[ProcessWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[ProcessWorkerWrapper] = [] + + if world_size == 1: + self.worker_monitor = None + else: + result_handler = ResultHandler() + for rank in range(1, world_size): + worker = ProcessWorkerWrapper(result_handler, + WorkerWrapperBase, + self.vllm_config, rank) + self.workers.append(worker) + if rank % tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) + + self.worker_monitor = WorkerMonitor(self.workers, result_handler) + result_handler.start() + self.worker_monitor.start() + + # Set up signal handlers to shutdown the executor cleanly + # sometimes gc does not work well + + self.driver_worker = WorkerWrapperBase(self.vllm_config, 0) + + all_kwargs = [] + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + for i in range(world_size): + local_rank = i + rank = i + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self._run_workers("init_worker", all_kwargs) + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + self.driver_exec_model = make_async(self.driver_worker.execute_model) + self.pp_locks: Optional[List[asyncio.Lock]] = None + + def shutdown(self): + if (worker_monitor := getattr(self, "worker_monitor", + None)) is not None: + worker_monitor.close() + + def _driver_execute_model( + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_model(execute_model_req) + + def _run_workers( + self, + method: Union[str, Callable], + *args, + async_run_tensor_parallel_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> List[Any]: + """Runs the given method on all workers. + + Args: + async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. + """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if async_run_tensor_parallel_workers_only: + # Run only non-driver workers and just return futures. + return [ + worker.execute_method(sent_method, *args, **kwargs) + for worker in self.non_driver_workers + ] + + # Start all remote workers first. 
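+        # The calls below return handles immediately (results are fetched
+        # later with `.get()`), while the driver's `run_method` call blocks
+        # in this process, so the remote workers execute concurrently with
+        # the driver.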
+ worker_outputs = [ + worker.execute_method(sent_method, *args, **kwargs) + for worker in self.workers + ] + + driver_worker_output = run_method(self.driver_worker, sent_method, + args, kwargs) + + # Get the results of the workers. + return [driver_worker_output + ] + [output.get() for output in worker_outputs] + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + if self.worker_monitor is not None and not self.worker_monitor.is_alive( + ): + raise RuntimeError("Worker processes are not running") + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if not self.tp_driver_workers: + return await self.driver_exec_model(execute_model_req) + + if self.pp_locks is None: + # This locks each pipeline parallel stage so multiple virtual + # engines can't execute on the same stage at the same time + # We create the locks here to avoid creating them in the constructor + # which uses a different asyncio loop. + self.pp_locks = [ + asyncio.Lock() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + + tasks = [ + asyncio.create_task( + _run_task_with_lock(self.driver_exec_model, self.pp_locks[0], + execute_model_req)) + ] + for pp_rank, driver_worker in enumerate(self.tp_driver_workers, + start=1): + tasks.append( + asyncio.create_task( + _run_task_with_lock(driver_worker.execute_method_async, + self.pp_locks[pp_rank], + "execute_model", execute_model_req))) + results = await asyncio.gather(*tasks) + + # Only the last PP stage has the final results. + return results[-1] + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method_async("start_worker_execution_loop") + for worker in self.non_driver_workers + ] + return await asyncio.gather(*coros) diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/msgspec_utils.py b/.venv/lib/python3.11/site-packages/vllm/executor/msgspec_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e680d53cbd10e216407f0850a84d2f0f688967d9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/msgspec_utils.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 + +from array import array +from typing import Any, Type + +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE + + +def encode_hook(obj: Any) -> Any: + """Custom msgspec enc hook that supports array types. + + See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder + """ + if isinstance(obj, array): + assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( + f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " + f"Given array has a type code of {obj.typecode}.") + return obj.tobytes() + + +def decode_hook(type: Type, obj: Any) -> Any: + """Custom msgspec dec hook that supports array types. 
+ + See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder + """ + if type is array: + deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) + deserialized.frombytes(obj) + return deserialized diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/ray_distributed_executor.py b/.venv/lib/python3.11/site-packages/vllm/executor/ray_distributed_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..6a25a4d50fb98b083b83a84a770429144fc0e3c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/ray_distributed_executor.py @@ -0,0 +1,638 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import os +from collections import defaultdict +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union + +import cloudpickle +import msgspec + +import vllm.envs as envs +from vllm.executor.executor_base import ( + DistributedExecutorBase) # yapf: disable +from vllm.executor.msgspec_utils import encode_hook +from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster, + ray) +from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform +from vllm.sequence import ExecuteModelRequest +from vllm.utils import (_run_task_with_lock, get_distributed_init_method, + get_ip, get_open_port, make_async) + +if ray is not None: + from ray.actor import ActorHandle + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +else: + ActorHandle = None + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +@dataclass +class RayWorkerMetaData: + """ + Metadata for a Ray worker. + The order of ray worker creation can be random, + and we need to reset the rank after creating all workers. + """ + worker: ActorHandle + created_rank: int + adjusted_rank: int = -1 + ip: str = "" + + +class RayDistributedExecutor(DistributedExecutorBase): + + uses_ray: bool = True + + def _init_executor(self) -> None: + self.forward_dag: Optional[ray.dag.CompiledDAG] = None + if envs.VLLM_USE_V1: + # v1 always uses the compiled DAG and SPMD worker. + os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" + os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" + # If the env var is set, it uses the Ray's compiled DAG API + # which optimizes the control plane overhead. + # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. + # Currently, this requires USE_RAY_SPMD_WORKER=True. + self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG + # If the env var is set, then we do not distinguish between the + # "driver worker" vs other workers. Also, the rank 0 worker will + # be executed in a remote Ray worker. Currently this requires + # USE_RAY_COMPILED_DAG=True. + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if self.use_ray_compiled_dag: + assert self.use_ray_spmd_worker, ( + "VLLM_USE_RAY_COMPILED_DAG=1 requires " + "VLLM_USE_RAY_SPMD_WORKER=1") + if self.use_ray_spmd_worker: + # TODO: Support SPMD worker for non-DAG Ray executor. + assert self.use_ray_compiled_dag, ( + "VLLM_USE_RAY_SPMD_WORKER=1 requires " + "VLLM_USE_RAY_COMPILED_DAG=1") + + assert self.uses_ray + initialize_ray_cluster(self.parallel_config) + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. 
+ ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) + self.output_decoder = msgspec.msgpack.Decoder( + Optional[List[SamplerOutput]]) + self.use_v1 = envs.VLLM_USE_V1 + + self.pp_locks: Optional[List[asyncio.Lock]] = None + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if not self.use_ray_compiled_dag: + self.driver_exec_method = make_async( + self.driver_worker.execute_method) + + def shutdown(self) -> None: + if hasattr(self, "forward_dag") and self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) + self.forward_dag = None + + def _configure_ray_workers_use_nsight(self, + ray_remote_kwargs) -> Dict[str, Any]: + # If nsight profiling is enabled, we need to set the profiling + # configuration for the ray workers as runtime env. + runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) + runtime_env.update({ + "nsight": { + "t": "cuda,cudnn,cublas", + "o": "'worker_process_%p'", + "cuda-graph-trace": "node", + } + }) + + return ray_remote_kwargs + + # child class could overwrite this to return actual env vars. + def _get_env_vars_to_be_updated(self): + return self._env_vars_for_all_workers + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Used in ray compiled DAG: indexed first by PP rank, + # and then TP rank. In other words, the inner list is + # the TP group of workers for a PP rank. + self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] + + if self.parallel_config.ray_workers_use_nsight: + ray_remote_kwargs = self._configure_ray_workers_use_nsight( + ray_remote_kwargs) + + logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) + + # Create the workers. + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. + bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") + else: + # use the first N bundles that have GPU resources. 
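+            # `bundle_specs` is a list of resource dicts, roughly
+            # [{"GPU": 1.0, "CPU": 1.0}, {"GPU": 1.0, "CPU": 1.0}, ...]
+            # (values here are illustrative); only bundles that reserve the
+            # platform's device resource are usable for workers.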
+ bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + + worker_metadata: List[RayWorkerMetaData] = [] + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + if current_platform.ray_device_key == "GPU": + # NV+AMD GPUs, and Intel XPUs + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + else: + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={current_platform.ray_device_key: num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + worker_metadata.append( + RayWorkerMetaData(worker=worker, created_rank=rank)) + + worker_ips = ray.get([ + each.worker.get_node_ip.remote() # type: ignore[attr-defined] + for each in worker_metadata + ]) + + for each, ip in zip(worker_metadata, worker_ips): + each.ip = ip + + if not self.use_ray_spmd_worker: + for i, each in enumerate(worker_metadata): + # find and remove the dummy worker from the list + worker = each.worker + worker_ip = each.ip + if self.driver_dummy_worker is None and worker_ip == driver_ip: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + vllm_config=self.vllm_config, rpc_rank=0) + worker_metadata.pop(i) + break + + logger.debug("workers: %s", worker_metadata) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. + """ + ip = item.ip + return (0 if ip == driver_ip else 1, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + sorted_worker_metadata = sorted(worker_metadata, + key=sort_by_driver_then_worker_ip) + start_rank = 0 if self.use_ray_spmd_worker else 1 + for i, item in enumerate(sorted_worker_metadata): + item.adjusted_rank = i + start_rank + self.workers = [item.worker for item in sorted_worker_metadata] + rerank_mapping = { + item.created_rank: item.adjusted_rank + for item in sorted_worker_metadata + } + self._run_workers("adjust_rank", rerank_mapping) + + # Get the set of GPU IDs used on each node. 
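+        # Each entry is a (node_id, gpu_ids) tuple, one per worker, roughly
+        # [("<node-a>", [0, 1]), ("<node-a>", [2, 3]), ...]
+        # (node ids and GPU ids here are illustrative placeholders).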
+ worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + all_ips = set(worker_ips + [driver_ip]) + n_ips = len(all_ips) + n_nodes = len(node_workers) + + if n_nodes != n_ips: + raise RuntimeError( + f"Every node should have a unique IP address. Got {n_nodes}" + f" nodes with node ids {list(node_workers.keys())} and " + f"{n_ips} unique IP addresses {all_ips}. Please check your" + " network configuration. If you set `VLLM_HOST_IP`" + " environment variable, make sure it is unique for" + " each node.") + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [{ + current_platform.device_control_env_var: + ",".join(map(str, node_gpus[node_id])), + } for (node_id, _) in worker_node_and_gpu_ids] + + for args in all_args_to_update_environment_variables: + # some carry-over env vars from the driver + # TODO: refactor platform-specific env vars + for name in [ + "VLLM_ATTENTION_BACKEND", + "TPU_CHIPS_PER_HOST_BOUNDS", + "TPU_HOST_BOUNDS", + "VLLM_USE_V1", + "VLLM_TRACE_FUNCTION", + ]: + if name in os.environ: + args[name] = os.environ[name] + + self._env_vars_for_all_workers = ( + all_args_to_update_environment_variables) + + self._run_workers("update_environment_variables", + self._get_env_vars_to_be_updated()) + + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + all_kwargs = [] + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): + local_rank = node_workers[node_id].index(rank) + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self._run_workers("init_worker", all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. 
+ max_parallel_loading_workers) + + if self.use_ray_spmd_worker: + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + self.pp_tp_workers.append([]) + for tp_rank in range( + self.parallel_config.tensor_parallel_size): + # PP=2, TP=4 + # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] + rank = (pp_rank * self.parallel_config.tensor_parallel_size + ) + tp_rank + assert len(self.pp_tp_workers[pp_rank]) == tp_rank + assert pp_rank < len(self.pp_tp_workers) + self.pp_tp_workers[pp_rank].append(self.workers[rank]) + + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + # Enforce rank order for correct rank to return final output. + for index, worker in enumerate(self.workers): + # The driver worker is rank 0 and not in self.workers. + rank = index + 1 + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) + + def _driver_execute_model( + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return super().execute_model(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) + + if self.use_v1: + serialized_data = execute_model_req + else: + serialized_data = self.input_encoder.encode(execute_model_req) + outputs = ray.get(self.forward_dag.execute(serialized_data)) + if self.use_v1: + output = outputs[0] + else: + output = self.output_decoder.decode(outputs[0]) + return output + + def _run_workers( + self, + method: Union[str, Callable], + *args, + async_run_tensor_parallel_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + Args: + - async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. + - args/kwargs: All workers share the same args/kwargs + """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method + if self.use_ray_spmd_worker: + assert not async_run_tensor_parallel_workers_only, ( + "async_run_tensor_parallel_workers_only is not supported for " + "spmd mode.") + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + # Start the ray workers first. 
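+        # (Illustration: a call such as `self._run_workers("init_device")`
+        # dispatches to every Ray actor below and returns one result per
+        # worker, with the driver worker's result first when not in SPMD
+        # mode.)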
+ ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers + ray_worker_outputs = [ + worker.execute_method.remote(sent_method, *args, **kwargs) + for worker in ray_workers + ] + + if async_run_tensor_parallel_workers_only: + # Just return futures + return ray_worker_outputs + + driver_worker_output = [] + # In SPMD mode, the driver worker is the same as any other worker, + # so we only explicitly execute on the driver worker if using a + # non-SPMD worker class. + if not self.use_ray_spmd_worker: + # Start the driver worker after all the ray workers. + driver_worker_output = [ + self.driver_worker.execute_method(sent_method, *args, **kwargs) + ] + + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return driver_worker_output + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def _check_ray_adag_installation(self): + import pkg_resources + from packaging import version + + required_version = version.parse("2.40") + current_version = version.parse( + pkg_resources.get_distribution("ray").version) + if current_version < required_version: + raise ValueError(f"Ray version {required_version} is " + f"required, but found {current_version}") + + import importlib.util + adag_spec = importlib.util.find_spec( + "ray.experimental.compiled_dag_ref") + if adag_spec is None: + raise ValueError("Ray accelerated DAG is not installed. " + "Run `pip install ray[adag]` to install it.") + + cupy_spec = importlib.util.find_spec("cupy") + if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: + raise ValueError( + "cupy is not installed but required since " + "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." + "Run `pip install ray[adag]` and check cupy installation.") + + def _compiled_ray_dag(self, enable_asyncio: bool): + assert self.parallel_config.use_ray + self._check_ray_adag_installation() + from ray.dag import InputNode, MultiOutputNode + from ray.experimental.channel.torch_tensor_type import TorchTensorType + + logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) + logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) + with InputNode() as input_data: + # Example DAG: PP=2, TP=4 + # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 + # -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501 + # -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501 + # -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501 + + # All workers in the first TP group will take in the + # ExecuteModelRequest as input. + outputs = [input_data for _ in self.pp_tp_workers[0]] + for pp_rank, tp_group in enumerate(self.pp_tp_workers): + # Each PP worker takes in the output of the previous PP worker, + # and the TP group executes in SPMD fashion. + if self.use_v1: + outputs = [ + worker.execute_model. + bind( # type: ignore[attr-defined] + outputs[i]) for i, worker in enumerate(tp_group) + ] + else: + outputs = [ + worker.execute_model_spmd. 
+ bind( # type: ignore[attr-defined] + outputs[i]) for i, worker in enumerate(tp_group) + ] + + last_pp_rank = len(self.pp_tp_workers) - 1 + if pp_rank < last_pp_rank: + # Specify how intermediate tensors should be passed + # between pp stages, no need to specify for the last + # pp stage. + transport = "nccl" \ + if envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL \ + else "auto" + outputs = [ + output.with_type_hint( + TorchTensorType(transport=transport)) + for output in outputs + ] + + forward_dag = MultiOutputNode(outputs) + + return forward_dag.experimental_compile( + enable_asyncio=enable_asyncio, + _overlap_gpu_communication=envs. + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) + + def __del__(self): + self.shutdown() + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return await super().execute_model_async(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) + + serialized_data = self.input_encoder.encode(execute_model_req) + dag_future = await self.forward_dag.execute_async(serialized_data) + output = await dag_future[0] + return self.output_decoder.decode(output) + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + if not self.tp_driver_workers: + return await self.driver_exec_method("execute_model", + execute_model_req) + if self.pp_locks is None: + # This locks each pipeline parallel stage so multiple virtual + # engines can't execute on the same stage at the same time + # We create the locks here to avoid creating them in the constructor + # which uses a different asyncio loop. + self.pp_locks = [ + asyncio.Lock() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + + tasks = [ + asyncio.create_task( + _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], + "execute_model", execute_model_req)) + ] + for pp_rank, driver_worker in enumerate(self.tp_driver_workers, + start=1): + tasks.append( + asyncio.create_task( + _run_task_with_lock(driver_worker.execute_method.remote, + self.pp_locks[pp_rank], + "execute_model", execute_model_req))) + + results = await asyncio.gather(*tasks) + + # Only the last PP stage has the final results. + return results[-1] + + async def _start_worker_execution_loop(self): + assert not self.use_ray_spmd_worker, ( + "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1") + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.non_driver_workers + ] + return await asyncio.gather(*coros) + + def check_health(self) -> None: + # Assume that the Ray workers are healthy. 
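+        # A failed actor still surfaces eventually: `ray.get` on results
+        # from a dead worker raises (e.g. ray.exceptions.RayActorError)
+        # during the next `_run_workers` call.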
+ # TODO: check the health of the Ray workers + return diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/ray_utils.py b/.venv/lib/python3.11/site-packages/vllm/executor/ray_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7b30155971a6d353f5b6f18ac894ac2c2b6d1bef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/ray_utils.py @@ -0,0 +1,378 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +import msgspec + +from vllm.config import ParallelConfig +from vllm.executor.msgspec_utils import decode_hook, encode_hook +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.sequence import ExecuteModelRequest, IntermediateTensors +from vllm.utils import get_ip +from vllm.worker.worker_base import WorkerWrapperBase + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + from vllm.v1.outputs import ModelRunnerOutput + +logger = init_logger(__name__) +PG_WAIT_TIMEOUT = 1800 + +try: + import ray + from ray.util import placement_group_table + from ray.util.placement_group import PlacementGroup + try: + from ray._private.state import available_resources_per_node + except ImportError: + # Ray 2.9.x doesn't expose `available_resources_per_node` + from ray._private.state import state as _state + available_resources_per_node = _state._available_resources_per_node + + class RayWorkerWrapper(WorkerWrapperBase): + """Ray wrapper for vllm.worker.Worker, allowing Worker to be + lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + # Since the compiled DAG runs a main execution + # in a different thread that calls cuda.set_device. + # The flag indicates is set_device is called on + # that thread. + self.compiled_dag_cuda_device_set = False + + self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, + dec_hook=decode_hook) + self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) + + def get_node_ip(self) -> str: + return get_ip() + + def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: + node_id = ray.get_runtime_context().get_node_id() + device_key = current_platform.ray_device_key + if not device_key: + raise RuntimeError("current platform %s does not support ray.", + current_platform.device_name) + gpu_ids = ray.get_runtime_context().get_accelerator_ids( + )[device_key] + return node_id, gpu_ids + + def execute_model_spmd( + self, req_or_tuple: Union[bytes, + Tuple[bytes, + Optional[IntermediateTensors]]] + ) -> bytes: + """Execute model in SPMD fashion: used only when SPMD worker and + compiled DAG are both enabled. + + Args: + req_or_tuple: A request or a tuple containing the + request and intermediate tensors. Intermediate tensors are + None unless if it is provided because it is > 0 pipeline + stage. The request is serialized by msgspec. + """ + if isinstance(req_or_tuple, bytes): + serialized_req, intermediate_tensors = req_or_tuple, None + else: + serialized_req, intermediate_tensors = req_or_tuple + + execute_model_req = self.input_decoder.decode(serialized_req) + + # TODO(swang): This is needed right now because Ray aDAG executes + # on a background thread, so we need to reset torch's current + # device. 
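+            # `torch.cuda.set_device` is only called once per wrapper; the
+            # `compiled_dag_cuda_device_set` flag set in __init__ guards it,
+            # mirroring `setup_device_if_necessary` below.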
+ import torch + if not self.compiled_dag_cuda_device_set: + torch.cuda.set_device(self.worker.device) + self.compiled_dag_cuda_device_set = True + + output = self.worker._execute_model_spmd(execute_model_req, + intermediate_tensors) + # Pipeline model request and output to the next pipeline stage. + if isinstance(output, IntermediateTensors): + output = serialized_req, output + else: + output = self.output_encoder.encode(output) + + return output + + def setup_device_if_necessary(self): + # TODO(swang): This is needed right now because Ray CG executes + # on a background thread, so we need to reset torch's current + # device. + # We can remove this API after it is fixed in compiled graph. + import torch + assert self.worker is not None, "Worker is not initialized" + if not self.compiled_dag_cuda_device_set: + torch.cuda.set_device(self.worker.device) + self.compiled_dag_cuda_device_set = True + + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> "ModelRunnerOutput": + self.setup_device_if_necessary() + assert self.worker is not None, "Worker is not initialized" + output = self.worker.model_runner.execute_model(scheduler_output) + return output + + def override_env_vars(self, vars: Dict[str, str]): + os.environ.update(vars) + + ray_import_err = None + +except ImportError as e: + ray = None # type: ignore + ray_import_err = e + RayWorkerWrapper = None # type: ignore + + +def ray_is_available() -> bool: + """Returns True if Ray is available.""" + return ray is not None + + +def assert_ray_available(): + """Raise an exception if Ray is not available.""" + if ray is None: + raise ValueError("Failed to import Ray, please install Ray with " + "`pip install ray`.") from ray_import_err + + +def _verify_bundles(placement_group: "PlacementGroup", + parallel_config: ParallelConfig, device_str: str): + """Verify a given placement group has bundles located in the right place. + + There are 2 rules. + - Warn if all tensor parallel workers cannot fit in a single node. + - Fail if driver node is not included in a placement group. + """ + assert ray.is_initialized(), ( + "Ray is not initialized although distributed-executor-backend is ray.") + pg_data = placement_group_table(placement_group) + # bundle_idx -> node_id + bundle_to_node_ids = pg_data["bundles_to_node_id"] + # bundle_idx -> bundle (e.g., {"GPU": 1}) + bundles = pg_data["bundles"] + # node_id -> List of bundle (e.g., {"GPU": 1}) + node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) + + for bundle_idx, node_id in bundle_to_node_ids.items(): + node_id_to_bundle[node_id].append(bundles[bundle_idx]) + driver_node_id = ray.get_runtime_context().get_node_id() + + if driver_node_id not in node_id_to_bundle: + raise RuntimeError( + f"driver node id {driver_node_id} is not included in a placement " + f"group {placement_group.id}. Node id -> bundles " + f"{node_id_to_bundle}. " + "You don't have enough GPUs available in a current node. Check " + "`ray status` to see if you have available GPUs in a node " + f"{driver_node_id} before starting an vLLM engine.") + + for node_id, bundles in node_id_to_bundle.items(): + if len(bundles) < parallel_config.tensor_parallel_size: + logger.warning( + "tensor_parallel_size=%d " + "is bigger than a reserved number of %ss (%d " + "%ss) in a node %s. Tensor parallel workers can be " + "spread out to 2+ nodes which can degrade the performance " + "unless you have fast interconnect across nodes, like " + "Infiniband. 
To resolve this issue, make sure you have more " + "than %d GPUs available at each node.", + parallel_config.tensor_parallel_size, device_str, len(bundles), + device_str, node_id, parallel_config.tensor_parallel_size) + + +def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): + """Wait until a placement group is ready. + + It prints the informative log messages if the placement group is + not created within time. + + """ + # Wait until PG is ready - this will block until all + # requested resources are available, and will timeout + # if they cannot be provisioned. + placement_group_specs = current_placement_group.bundle_specs + + s = time.time() + pg_ready_ref = current_placement_group.ready() + wait_interval = 10 + while time.time() - s < PG_WAIT_TIMEOUT: + ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) + if len(ready) > 0: + break + + # Exponential backoff for warning print. + wait_interval *= 2 + logger.info( + "Waiting for creating a placement group of specs for " + "%d seconds. specs=%s. Check " + "`ray status` to see if you have enough resources," + " and make sure the IP addresses used by ray cluster" + " are the same as VLLM_HOST_IP environment variable" + " specified in each node if you are running on a multi-node.", + int(time.time() - s), placement_group_specs) + + try: + ray.get(pg_ready_ref, timeout=0) + except ray.exceptions.GetTimeoutError: + raise ValueError( + "Cannot provide a placement group of " + f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " + "`ray status` to make sure the cluster has enough resources." + ) from None + + +def _wait_until_pg_removed(current_placement_group: "PlacementGroup"): + ray.util.remove_placement_group(current_placement_group) + s = time.time() + wait_interval = 10 + while time.time() - s < PG_WAIT_TIMEOUT: + pg = ray.util.get_current_placement_group() + if pg is None: + break + + # Exponential backoff for warning print. + wait_interval *= 2 + logger.info( + "Waiting for removing a placement group of specs for " + "%d seconds.", int(time.time() - s)) + time.sleep(wait_interval) + + +def initialize_ray_cluster( + parallel_config: ParallelConfig, + ray_address: Optional[str] = None, +): + """Initialize the distributed cluster with Ray. + + it will connect to the Ray cluster and create a placement group + for the workers, which includes the specification of the resources + for each distributed worker. + + Args: + parallel_config: The configurations for parallel execution. + ray_address: The address of the Ray cluster. If None, uses + the default Ray cluster address. + """ + assert_ray_available() + from vllm.platforms import current_platform + + # Connect to a ray cluster. + if current_platform.is_rocm() or current_platform.is_xpu(): + # Try to connect existing ray instance and create a new one if not found + try: + ray.init("auto", ignore_reinit_error=True) + except ConnectionError: + logger.warning( + "No existing RAY instance detected. " + "A new instance will be launched with current node resources.") + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) + + if parallel_config.placement_group: + # Placement group is already set. 
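+        # A placement group may have been supplied by the caller (or
+        # assigned by an earlier call to this function, which sets it at
+        # the end), so there is nothing more to do here.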
+ return + + device_str = current_platform.ray_device_key + if not device_str: + raise ValueError( + f"current platform {current_platform.device_name} does not " + "support ray.") + + # Create placement group for worker processes + current_placement_group = ray.util.get_current_placement_group() + if current_placement_group: + # We are in a placement group + bundles = current_placement_group.bundle_specs + # Verify that we can use the placement group. + device_bundles = 0 + for bundle in bundles: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: + raise ValueError( + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: + raise ValueError( + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. " + f"Total number of devices: {device_bundles}.") + else: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + # Log a warning message and delay resource allocation failure response. + # Avoid immediate rejection to allow user-initiated placement group + # created and wait cluster to be ready + if parallel_config.world_size > num_devices_in_cluster: + logger.warning( + "The number of required %ss exceeds the total " + "number of available %ss in the placement group.", device_str, + device_str) + # Create a new placement group + placement_group_specs: List[Dict[str, float]] = ([{ + device_str: 1.0 + } for _ in range(parallel_config.world_size)]) + + # vLLM engine is also a worker to execute model with an accelerator, + # so it requires to have the device in a current node. Check if + # the current node has at least one device. + current_ip = get_ip() + current_node_id = ray.get_runtime_context().get_node_id() + current_node_resource = available_resources_per_node()[current_node_id] + if current_node_resource.get(device_str, 0) < 1: + raise ValueError( + f"Current node has no {device_str} available. " + f"{current_node_resource=}. vLLM engine cannot start without " + f"{device_str}. Make sure you have at least 1 {device_str} " + f"available in a node {current_node_id=} {current_ip=}.") + # This way, at least bundle is required to be created in a current + # node. + placement_group_specs[0][f"node:{current_ip}"] = 0.001 + + # By default, Ray packs resources as much as possible. 
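+        # At this point the specs look roughly like, for world_size=2 on a
+        # CUDA node with a (hypothetical) driver IP of 10.0.0.1:
+        #   [{"GPU": 1.0, "node:10.0.0.1": 0.001}, {"GPU": 1.0}]
+        # i.e. one device bundle per rank, with the first bundle pinned to
+        # the driver's node.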
+ current_placement_group = ray.util.placement_group( + placement_group_specs, strategy="PACK") + _wait_until_pg_ready(current_placement_group) + + assert current_placement_group is not None + _verify_bundles(current_placement_group, parallel_config, device_str) + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group + + +def get_num_tpu_nodes() -> int: + from ray._private.accelerators import TPUAcceleratorManager + cluster_resources = ray.cluster_resources() + total_tpus = int(cluster_resources["TPU"]) + tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators() + assert total_tpus % tpus_per_node == 0 + return total_tpus // tpus_per_node + + +def get_num_nodes_in_placement_group() -> int: + pg_table = ray.util.placement_group_table() + current_pg = ray.util.get_current_placement_group() + num_nodes = 0 + + if current_pg: + nodes_in_pg = set() + for pg_key, pg in pg_table.items(): + if pg_key == current_pg.id.hex(): + for _, node in pg["bundles_to_node_id"].items(): + nodes_in_pg.add(node) + num_nodes = len(nodes_in_pg) + + return num_nodes diff --git a/.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py b/.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..dcb4a8f27c25213365a4422d912d535e5e845d6f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + run_method) +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class UniProcExecutor(ExecutorBase): + + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. + """ + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + local_rank = 0 + rank = 0 + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: + if kwargs is None: + kwargs = {} + answer = run_method(self.driver_worker, method, args, kwargs) + return [answer] + + def check_health(self) -> None: + # UniProcExecutor will always be healthy as long as + # it's running. + return + + +UniProcExecutorAsync = UniProcExecutor + + +class ExecutorWithExternalLauncher(UniProcExecutor): + """An executor that uses external launchers to launch engines, + specially designed for torchrun-compatible launchers, for + offline inference with tensor parallelism. + + see https://github.com/vllm-project/vllm/issues/11400 for + the motivation, and examples/offline_inference/torchrun_example.py + for the usage example. 
+ + The key idea: although it is tensor-parallel inference, we only + create one worker per executor, users will launch multiple + engines with torchrun-compatible launchers, and all these engines + work together to process the same prompts. When scheduling is + deterministic, all the engines will generate the same outputs, + and they don't need to synchronize the states with each other. + """ + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. + """ + assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \ + ("ExecutorWithExternalLauncher does not " + "support pipeline parallelism.") + assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ + ("ExecutorWithExternalLauncher needs deterministic " + "execution, so it" + "does not support delay_factor in scheduling") + assert not envs.VLLM_USE_V1, \ + ("V1 architecture cannot guarantee deterministic execution, " + "so it is not supported in ExecutorWithExternalLauncher.") + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + # engines are launched in torchrun-compatible launchers + # so we can use the env:// method. + # required env vars: + # - RANK + # - MASTER_ADDR + # - MASTER_PORT + distributed_init_method = "env://" + rank = int(os.environ["RANK"]) + local_rank = rank + is_driver_worker = True + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """ + Determine the number of available KV blocks. + Add an additional all_reduce to get the min across all ranks. + Note that even if we have the same `gpu_memory_utilization` and + `swap_space`, the available memory in every rank might still + differ because NCCL can take different amounts of memory in + different ranks. Therefore, it is necessary to test if all ranks + agree on the same KV cache configuration. 
+ """ + a, b = super().determine_num_available_blocks() + from vllm.distributed.parallel_state import get_world_group + cpu_group = get_world_group().cpu_group + a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64) + b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64) + dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + return a_tensor.item(), b_tensor.item() diff --git a/.venv/lib/python3.11/site-packages/vllm/forward_context.py b/.venv/lib/python3.11/site-packages/vllm/forward_context.py new file mode 100644 index 0000000000000000000000000000000000000000..10de8bc593ab805ec06664c2ed0125bdb4a708b7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/forward_context.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 + +import time +from collections import defaultdict +from contextlib import contextmanager +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, Optional + +import torch + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + +logger = init_logger(__name__) + +track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 +last_logging_time: float = 0 +forward_start_time: float = 0 +batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL +batchsize_forward_time: defaultdict = defaultdict(list) + + +@dataclass +class ForwardContext: + # copy from vllm_config.compilation_config.static_forward_context + attn_layers: Dict[str, Any] + # TODO: extend to support per-layer dynamic forward context + attn_metadata: "AttentionMetadata" # set dynamically for each forward pass + # TODO: remove after making all virtual_engines share the same kv cache + virtual_engine: int # set dynamically for each forward pass + + +_forward_context: Optional[ForwardContext] = None + + +def get_forward_context() -> ForwardContext: + """Get the current forward context.""" + assert _forward_context is not None, ( + "Forward context is not set. " + "Please use `set_forward_context` to set the forward context.") + return _forward_context + + +@contextmanager +def set_forward_context(attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0): + """A context manager that stores the current forward context, + can be attention metadata, etc. + Here we can inject common logic for every model forward pass. 
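+
+    Example (illustrative sketch of a typical call site in a model runner):
+
+        with set_forward_context(attn_metadata, vllm_config):
+            hidden_states = model(input_ids, positions)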
+ """ + global forward_start_time + need_to_track_batchsize = track_batchsize and attn_metadata is not None + if need_to_track_batchsize: + forward_start_time = time.perf_counter() + global _forward_context + prev_context = _forward_context + _forward_context = ForwardContext( + attn_layers=vllm_config.compilation_config.static_forward_context, + virtual_engine=virtual_engine, + attn_metadata=attn_metadata) + try: + yield + finally: + global last_logging_time, batchsize_logging_interval + if need_to_track_batchsize: + if hasattr(attn_metadata, "num_prefill_tokens"): + # for v0 attention backends + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + # for v1 attention backends + batchsize = attn_metadata.num_input_tokens + # we use synchronous scheduling right now, + # adding a sync point here should not affect + # scheduling of the next batch + torch.cuda.synchronize() + now = time.perf_counter() + # time measurement is in milliseconds + batchsize_forward_time[batchsize].append( + (now - forward_start_time) * 1000) + if now - last_logging_time > batchsize_logging_interval: + last_logging_time = now + forward_stats = [] + for bs, times in batchsize_forward_time.items(): + if len(times) <= 1: + # can be cudagraph / profiling run + continue + medium = torch.quantile(torch.tensor(times), q=0.5).item() + medium = round(medium, 2) + forward_stats.append((bs, len(times), medium)) + forward_stats.sort(key=lambda x: x[1], reverse=True) + if forward_stats: + logger.info(("Batchsize forward time stats " + "(batchsize, count, median_time(ms)): %s"), + forward_stats) + _forward_context = prev_context diff --git a/.venv/lib/python3.11/site-packages/vllm/logger.py b/.venv/lib/python3.11/site-packages/vllm/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b20d55e3c1019ba7fd8d15bd6950f4c094cc2316 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/logger.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Logging configuration for vLLM.""" +import datetime +import json +import logging +import os +import sys +from functools import lru_cache, partial +from logging import Logger +from logging.config import dictConfig +from os import path +from types import MethodType +from typing import Any, Optional, cast + +import vllm.envs as envs + +VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING +VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH +VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL +VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX + +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + "%(filename)s:%(lineno)d] %(message)s") +_DATE_FORMAT = "%m-%d %H:%M:%S" + +DEFAULT_LOGGING_CONFIG = { + "formatters": { + "vllm": { + "class": "vllm.logging_utils.NewLineFormatter", + "datefmt": _DATE_FORMAT, + "format": _FORMAT, + }, + }, + "handlers": { + "vllm": { + "class": "logging.StreamHandler", + "formatter": "vllm", + "level": VLLM_LOGGING_LEVEL, + "stream": "ext://sys.stdout", + }, + }, + "loggers": { + "vllm": { + "handlers": ["vllm"], + "level": "DEBUG", + "propagate": False, + }, + }, + "version": 1, + "disable_existing_loggers": False +} + + +@lru_cache +def _print_info_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 2 to print the original caller's line info + logger.info(msg, stacklevel=2) + + +@lru_cache +def _print_warning_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 2 to print the original caller's line info + logger.warning(msg, stacklevel=2) + + +class 
_VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the :class:`logging.Logger`
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+
+    def info_once(self, msg: str) -> None:
+        """
+        As :meth:`info`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_info_once(self, msg)
+
+    def warning_once(self, msg: str) -> None:
+        """
+        As :meth:`warning`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_warning_once(self, msg)
+
+
+def _configure_vllm_root_logger() -> None:
+    logging_config = dict[str, Any]()
+
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
+
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: "
+                f"{VLLM_LOGGING_CONFIG_PATH}")
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+
+        if not isinstance(custom_config, dict):
+            raise ValueError(
+                "Invalid logging config. Expected Dict, got "
+                f"{type(custom_config).__name__}.")
+        logging_config = custom_config
+
+    for formatter in logging_config.get("formatters", {}).values():
+        # This provides backwards compatibility after #10134.
+        if formatter.get("class") == "vllm.logging.NewLineFormatter":
+            formatter["class"] = "vllm.logging_utils.NewLineFormatter"
+
+    if logging_config:
+        dictConfig(logging_config)
+
+
+def init_logger(name: str) -> _VllmLogger:
+    """The main purpose of this function is to ensure that loggers are
+    retrieved in such a way that we can be sure the root vllm logger has
+    already been configured."""
+
+    logger = logging.getLogger(name)
+
+    methods_to_patch = {
+        "info_once": _print_info_once,
+        "warning_once": _print_warning_once,
+    }
+
+    for method_name, method in methods_to_patch.items():
+        setattr(logger, method_name, MethodType(method, logger))
+
+    return cast(_VllmLogger, logger)
+
+
+# The root logger is initialized when the module is imported.
+# This is thread-safe as the module is only imported once,
+# guaranteed by the Python GIL.
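A small usage sketch of the patched logger returned by `init_logger` (the message strings are illustrative):

    from vllm.logger import init_logger

    logger = init_logger(__name__)
    logger.info("normal log line")           # standard logging.Logger behavior
    logger.info_once("printed only once")    # repeated identical calls are dropped (lru_cache)
    logger.warning_once("warned only once")  # same de-duplication for warnings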
+_configure_vllm_root_logger() + +logger = init_logger(__name__) + + +def _trace_calls(log_path, root_dir, frame, event, arg=None): + if event in ['call', 'return']: + # Extract the filename, line number, function name, and the code object + filename = frame.f_code.co_filename + lineno = frame.f_lineno + func_name = frame.f_code.co_name + if not filename.startswith(root_dir): + # only log the functions in the vllm root_dir + return + # Log every function call or return + try: + last_frame = frame.f_back + if last_frame is not None: + last_filename = last_frame.f_code.co_filename + last_lineno = last_frame.f_lineno + last_func_name = last_frame.f_code.co_name + else: + # initial frame + last_filename = "" + last_lineno = 0 + last_func_name = "" + with open(log_path, 'a') as f: + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + if event == 'call': + f.write(f"{ts} Call to" + f" {func_name} in {filename}:{lineno}" + f" from {last_func_name} in {last_filename}:" + f"{last_lineno}\n") + else: + f.write(f"{ts} Return from" + f" {func_name} in {filename}:{lineno}" + f" to {last_func_name} in {last_filename}:" + f"{last_lineno}\n") + except NameError: + # modules are deleted during shutdown + pass + return partial(_trace_calls, log_path, root_dir) + + +def enable_trace_function_call(log_file_path: str, + root_dir: Optional[str] = None): + """ + Enable tracing of every function call in code under `root_dir`. + This is useful for debugging hangs or crashes. + `log_file_path` is the path to the log file. + `root_dir` is the root directory of the code to trace. If None, it is the + vllm root directory. + + Note that this call is thread-level, any threads calling this function + will have the trace enabled. Other threads will not be affected. + """ + logger.warning( + "VLLM_TRACE_FUNCTION is enabled. It will record every" + " function executed by Python. This will slow down the code. 
It " + "is suggested to be used for debugging hang or crashes only.") + logger.info("Trace frame log is saved to %s", log_file_path) + if root_dir is None: + # by default, this is the vllm root directory + root_dir = os.path.dirname(os.path.dirname(__file__)) + sys.settrace(partial(_trace_calls, log_file_path, root_dir)) diff --git a/.venv/lib/python3.11/site-packages/vllm/logits_process.py b/.venv/lib/python3.11/site-packages/vllm/logits_process.py new file mode 100644 index 0000000000000000000000000000000000000000..d02072e8f81894894348c0501eb2ae6ec2081962 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/logits_process.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Callable, List, Tuple, Union + +import torch + +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], + Callable[[List[int], List[int], torch.Tensor], + torch.Tensor]] +"""LogitsProcessor is a function that takes a list +of previously generated tokens, the logits tensor +for the next token and, optionally, prompt tokens as a +first argument, and returns a modified tensor of logits +to sample from.""" + + +def get_bad_words_logits_processors( + bad_words: List[str], + tokenizer: AnyTokenizer) -> List[LogitsProcessor]: + bad_words_ids: List[List[int]] = list() + + for bad_word in bad_words: + # To prohibit words both at the beginning + # and in the middle of text + # (related to add_prefix_space tokenizer parameter) + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + prompt = prefix + bad_word.lstrip() + + if isinstance(tokenizer, MistralTokenizer): + # Mistral tokenizers should not add special tokens + prompt_token_ids = tokenizer.encode(prompt=prompt) + else: + prompt_token_ids = tokenizer.encode(text=prompt, + add_special_tokens=False) + + # If no space at the beginning + # or if prefix space produces a new word token + if (not add_prefix_space) or ( + add_prefix_space + and prompt_token_ids[0] != bad_words_ids[-1][0] + and len(prompt_token_ids) == len(bad_words_ids[-1])): + bad_words_ids.append(prompt_token_ids) + + return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)] + + +class NoBadWordsLogitsProcessor: + _SMALLEST_LOGIT = float("-inf") + _NEUTRAL_LOGIT = 0.0 + + def __init__(self, bad_words_ids: List[List[int]]): + self.bad_words_ids = bad_words_ids + self.word_bias: torch.FloatTensor = None + + def __call__( + self, + past_tokens_ids: Union[List[int], Tuple[int]], + logits: torch.FloatTensor, + ) -> torch.Tensor: + if self.word_bias is None: + self._init_word_bias(logits=logits) + + last_token_bias = torch.zeros_like(logits) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: # 1-token words already processed + continue + + if len(bad_word_ids) > len(past_tokens_ids) + 1: + continue + + prefix_length = len(bad_word_ids) - 1 + last_token_id = bad_word_ids[-1] + actual_prefix = past_tokens_ids[-prefix_length:] + expected_prefix = bad_word_ids[:prefix_length] + + assert len(actual_prefix) == len(expected_prefix) + + is_match = tuple(actual_prefix) == tuple(expected_prefix) + last_token_bias[last_token_id] += (self._SMALLEST_LOGIT if is_match + else self._NEUTRAL_LOGIT) + + logits = logits + self.word_bias + last_token_bias + + return logits + + def _init_word_bias(self, logits: torch.FloatTensor) -> None: + # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor # noqa: E501 + # from 
https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py + + vocab_size = logits.shape[-1] + + self._check_token_ids_bounds(vocab_size=vocab_size) + + self.word_bias = torch.zeros((vocab_size, ), + dtype=torch.float, + device=logits.device) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: + bad_word_id = bad_word_ids[-1] + self.word_bias[bad_word_id] = self._SMALLEST_LOGIT + + def _check_token_ids_bounds(self, vocab_size: int) -> None: + invalid_token_ids = [] + + for bad_word_ids in self.bad_words_ids: + for token_id in bad_word_ids: + if token_id < 0 or token_id >= vocab_size: + invalid_token_ids.append(token_id) + + if len(invalid_token_ids) > 0: + raise ValueError( + f"The model vocabulary size is {vocab_size}," + f" but the following tokens" + f" were specified as bad: {invalid_token_ids}." + f" All token id values should be integers satisfying:" + f" 0 <= token_id < {vocab_size}.") diff --git a/.venv/lib/python3.11/site-packages/vllm/outputs.py b/.venv/lib/python3.11/site-packages/vllm/outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..786380c37f6cb588f10b7be40871b9311171e3b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/outputs.py @@ -0,0 +1,529 @@ +# SPDX-License-Identifier: Apache-2.0 + +import time +from dataclasses import dataclass +from typing import Dict, Generic, List, MutableSequence, Optional +from typing import Sequence as GenericSequence +from typing import Union + +import torch +from typing_extensions import TypeVar, deprecated + +from vllm.lora.request import LoRARequest +from vllm.multimodal.inputs import MultiModalPlaceholderDict +from vllm.sampling_params import RequestOutputKind +from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, + SequenceGroup, SequenceGroupBase, SequenceStatus) + + +@dataclass +class CompletionOutput: + """The output data of one completion output of a request. + + Args: + index: The index of the output in the request. + text: The generated output text. + token_ids: The token IDs of the generated output text. + cumulative_logprob: The cumulative log probability of the generated + output text. + logprobs: The log probabilities of the top probability words at each + position if the logprobs are requested. + finish_reason: The reason why the sequence is finished. + stop_reason: The stop string or token id that caused the completion + to stop, None if the completion finished for some other reason + including encountering the EOS token. + lora_request: The LoRA request that was used to generate the output. + """ + + index: int + text: str + token_ids: GenericSequence[int] + cumulative_logprob: Optional[float] + logprobs: Optional[SampleLogprobs] + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + lora_request: Optional[LoRARequest] = None + + def finished(self) -> bool: + return self.finish_reason is not None + + def __repr__(self) -> str: + return (f"CompletionOutput(index={self.index}, " + f"text={self.text!r}, " + f"token_ids={self.token_ids}, " + f"cumulative_logprob={self.cumulative_logprob}, " + f"logprobs={self.logprobs}, " + f"finish_reason={self.finish_reason}, " + f"stop_reason={self.stop_reason})") + + +@dataclass +class PoolingOutput: + """The output data of one pooling output of a request. + + Args: + data: The extracted hidden states. 
+ """ + data: torch.Tensor + + def __repr__(self) -> str: + return (f"PoolingOutput(data={self.data})") + + def __eq__(self, other: object) -> bool: + return (isinstance(other, self.__class__) and bool( + (self.data == other.data).all())) + + @property + @deprecated("`LLM.encode()` now stores raw outputs in the `data` " + "attribute. To return embeddings, use `LLM.embed()`. " + "To return class probabilities, use `LLM.classify()` " + "and access the `probs` attribute. ") + def embedding(self) -> list[float]: + return self.data.tolist() + + +class RequestOutput: + """The output data of a completion request to the LLM. + + Args: + request_id: The unique ID of the request. + prompt: The prompt string of the request. + For encoder/decoder models, this is the + decoder input prompt. + prompt_token_ids: The token IDs of the prompt. + For encoder/decoder models, this is the + decoder input prompt token ids. + prompt_logprobs: The log probabilities to return per prompt token. + outputs: The output sequences of the request. + finished: Whether the whole request is finished. + metrics: Metrics associated with the request. + lora_request: The LoRA request that was used to generate the output. + encoder_prompt: The encoder prompt string of the request. + None if decoder-only. + encoder_prompt_token_ids: The token IDs of the encoder prompt. + None if decoder-only. + num_cached_tokens: The number of tokens with prefix cache hit. + """ + + def __init__( + self, + request_id: str, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]], + prompt_logprobs: Optional[PromptLogprobs], + outputs: List[CompletionOutput], + finished: bool, + metrics: Optional[RequestMetrics] = None, + lora_request: Optional[LoRARequest] = None, + encoder_prompt: Optional[str] = None, + encoder_prompt_token_ids: Optional[List[int]] = None, + num_cached_tokens: Optional[int] = None, + *, + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, + ) -> None: + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.multi_modal_placeholders = multi_modal_placeholders or {} + self.prompt_logprobs = prompt_logprobs + self.outputs = outputs + self.finished = finished + self.metrics = metrics + self.lora_request = lora_request + self.encoder_prompt = encoder_prompt + self.encoder_prompt_token_ids = encoder_prompt_token_ids + self.num_cached_tokens = num_cached_tokens + + @classmethod + def new( + cls, + request_id: str, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]], + text: str, + token_ids: List[int], + finished: bool = False, + ) -> "RequestOutput": + """Initialize a new RequestOutput object.""" + + # TODO: Support `n` > 1. 
+ completion_output = CompletionOutput( + index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=None, # TODO + ) + + return RequestOutput( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=finished, + ) + + def add(self, next_output: "RequestOutput") -> None: + """Merge subsequent RequestOutput into this one""" + + self.prompt = next_output.prompt + self.prompt_token_ids = next_output.prompt_token_ids + self.prompt_logprobs = next_output.prompt_logprobs + self.finished |= next_output.finished + + #TODO assuming n == 1 for now + completion = self.outputs[0] + next_completion = next_output.outputs[0] + completion.text += next_completion.text + if not isinstance(completion.token_ids, MutableSequence): + completion.token_ids = list(completion.token_ids) + completion.token_ids.extend(next_completion.token_ids) + if next_completion.logprobs: + assert completion.logprobs is not None + completion.logprobs.extend(next_completion.logprobs) + completion.cumulative_logprob = next_completion.cumulative_logprob + + @classmethod + def from_seq_group( + cls, seq_group: SequenceGroup, use_cache: bool, + seq_id_to_seq_group: Dict[str, SequenceGroupBase] + ) -> Optional["RequestOutput"]: + finished = seq_group.is_finished() + + if seq_group.request_id in seq_id_to_seq_group: + group: SequenceGroupBase = seq_id_to_seq_group[ + seq_group.request_id] + assembled_seq_group = group.maybe_assemble_group(seq_group) + if finished: + group.finish_seq(seq_group) + if assembled_seq_group is None: + return None + return cls.from_seq_group(assembled_seq_group, use_cache, + seq_id_to_seq_group) + + sampling_params = seq_group.sampling_params + if sampling_params is None: + raise ValueError( + "Sampling parameters are missing for a CompletionRequest.") + + if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and ( + not finished): + return None + + # Init cache (if needed) + if use_cache and seq_group.cached_request_output is None: + seq_group.cached_request_output = RequestOutput( # type: ignore + request_id="", + prompt=None, + prompt_token_ids=[], + prompt_logprobs=None, + outputs=[], + finished=False) + + top_n_seqs = seq_group.get_seqs() + + # Create the outputs. + # NOTE: We need omit logprobs here explicitly because the sequence + # always has the logprobs of the sampled tokens even if the + # logprobs are not requested. 
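A minimal sketch of merging streamed deltas with `RequestOutput.new()` and `add()` (the request id, prompt and token ids are made-up values):

    from vllm.outputs import RequestOutput

    first = RequestOutput.new("req-0", prompt="Hi", prompt_token_ids=[101],
                              text="Hel", token_ids=[42])
    delta = RequestOutput.new("req-0", prompt="Hi", prompt_token_ids=[101],
                              text="lo", token_ids=[43], finished=True)
    first.add(delta)
    assert first.outputs[0].text == "Hello"
    assert list(first.outputs[0].token_ids) == [42, 43]
    assert first.finished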
+ include_logprobs = sampling_params.logprobs is not None + text_buffer_length = sampling_params.output_text_buffer_length + delta = sampling_params.output_kind == RequestOutputKind.DELTA + + outputs = [] + include_prompt = True + # num_cached_tokens should be the same for all the sequences + num_cached_tokens = None + for i, seq in enumerate(top_n_seqs): + output_text = seq.get_output_text_to_return( + text_buffer_length, delta) + + output_token_ids = seq.get_output_token_ids_to_return(delta) + num_output_tokens = 1 if isinstance(output_token_ids, + int) else len(output_token_ids) + num_cached_tokens = seq.data.get_num_cached_tokens() + + output_logprobs = seq.output_logprobs if include_logprobs else None + + if delta: + # Slice logprobs delta if applicable + if output_logprobs: + output_logprobs = output_logprobs[-num_output_tokens:] + # Don't include prompt if this is after the first output + # containing decode token ids + if include_prompt and seq.get_output_len() > num_output_tokens: + include_prompt = False + + if use_cache: + # Get cached output object + cached_outputs = seq_group.cached_request_output.outputs # type: ignore + if i >= len(cached_outputs): + cached_outputs.append( + CompletionOutput(index=i, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + stop_reason=None)) + output = cached_outputs[i] + + # Init cached output object + assert output.index == i + output.text = output_text + + if isinstance(output_token_ids, int): + output.token_ids.clear() + output.token_ids.append(output_token_ids) + else: + output.token_ids = output_token_ids + + output.cumulative_logprob = seq.get_cumulative_logprob() \ + if include_logprobs else None + output.logprobs = output_logprobs + output.finish_reason = SequenceStatus.get_finished_reason( + seq.status) + output.stop_reason = seq.stop_reason + + else: + output = CompletionOutput( + top_n_seqs.index(seq), output_text, [output_token_ids] + if isinstance(output_token_ids, int) else output_token_ids, + seq.get_cumulative_logprob() if include_logprobs else None, + output_logprobs, + SequenceStatus.get_finished_reason(seq.status), + seq.stop_reason) + + outputs.append(output) + + # Every sequence in the sequence group should have the same prompt. 
+ if include_prompt: + prompt = seq_group.prompt + prompt_token_ids = seq_group.prompt_token_ids + encoder_prompt = seq_group.encoder_prompt + encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids + prompt_logprobs = seq_group.prompt_logprobs + else: + prompt = None + prompt_token_ids = None + encoder_prompt = None + encoder_prompt_token_ids = None + prompt_logprobs = None + finished_time = time.time() if finished else None + seq_group.set_finished_time(finished_time) + + init_kwargs = { + "request_id": seq_group.request_id, + "prompt": prompt, + "prompt_token_ids": prompt_token_ids, + "prompt_logprobs": prompt_logprobs, + "outputs": outputs, + "finished": finished, + "metrics": seq_group.metrics, + "lora_request": seq_group.lora_request, + "encoder_prompt": encoder_prompt, + "encoder_prompt_token_ids": encoder_prompt_token_ids, + "num_cached_tokens": num_cached_tokens, + "multi_modal_placeholders": seq_group.multi_modal_placeholders + } + + if use_cache: + request_output = seq_group.cached_request_output + request_output.__init__(**init_kwargs) # type: ignore + else: + request_output = cls(**init_kwargs) # type: ignore + + return request_output + + def __repr__(self) -> str: + return (f"RequestOutput(request_id={self.request_id}, " + f"prompt={self.prompt!r}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"encoder_prompt={self.encoder_prompt!r}, " + f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"outputs={self.outputs}, " + f"finished={self.finished}, " + f"metrics={self.metrics}, " + f"lora_request={self.lora_request}, " + f"num_cached_tokens={self.num_cached_tokens}, " + f"multi_modal_placeholders={self.multi_modal_placeholders})") + + +_O = TypeVar("_O", default=PoolingOutput) + + +class PoolingRequestOutput(Generic[_O]): + """ + The output data of a pooling request to the LLM. + + Args: + request_id (str): A unique identifier for the pooling request. + outputs (PoolingOutput): The pooling results for the given input. + prompt_token_ids (List[int]): A list of token IDs used in the prompt. + finished (bool): A flag indicating whether the pooling is completed. + """ + + def __init__(self, request_id: str, outputs: _O, + prompt_token_ids: List[int], finished: bool): + self.request_id = request_id + self.prompt_token_ids = prompt_token_ids + self.finished = finished + self.outputs = outputs + + @staticmethod + def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": + pooled_data = seq_group.pooled_data + assert pooled_data is not None + + data = pooled_data.to(dtype=torch.float32, device="cpu") + output = PoolingOutput(data) + prompt_token_ids = seq_group.prompt_token_ids + finished = seq_group.is_finished() + + return PoolingRequestOutput(seq_group.request_id, output, + prompt_token_ids, finished) + + def __repr__(self): + """ + Returns a string representation of an PoolingRequestOutput instance. + + The representation includes the request_id and the number of outputs, + providing a quick overview of the pooling request's results. + + Returns: + str: A string representation of the PoolingRequestOutput instance. 
+ """ + return (f"{type(self).__name__}(request_id={self.request_id!r}, " + f"outputs={self.outputs!r}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"finished={self.finished})") + + +class RequestOutputFactory: + + @staticmethod + def create(seq_group: SequenceGroup, + seq_id_to_seq_group: Dict[str, SequenceGroupBase], + use_cache: bool = False): + if seq_group.pooled_data is not None: + return PoolingRequestOutput.from_seq_group(seq_group) + else: + return RequestOutput.from_seq_group(seq_group, use_cache, + seq_id_to_seq_group) + + +@dataclass +class EmbeddingOutput: + """The output data of one embedding output of a request. + + Args: + embedding: The embedding vector, which is a list of floats. + Its length depends on the hidden dimension of the model. + """ + embedding: list[float] + + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 1: + raise ValueError("pooled_data should be a 1-D embedding vector") + + return EmbeddingOutput(pooled_data.tolist()) + + @property + def hidden_size(self) -> int: + return len(self.embedding) + + def __repr__(self) -> str: + return f"EmbeddingOutput(hidden_size={self.hidden_size})" + + +class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]): + + @staticmethod + def from_base(request_output: PoolingRequestOutput): + return EmbeddingRequestOutput( + request_id=request_output.request_id, + outputs=EmbeddingOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) + + +@dataclass +class ClassificationOutput: + """The output data of one classification output of a request. + + Args: + probs: The probability vector, which is a list of floats. + Its length depends on the number of classes. + """ + probs: list[float] + + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 1: + raise ValueError("pooled_data should be a 1-D probability vector") + + return ClassificationOutput(pooled_data.tolist()) + + @property + def num_classes(self) -> int: + return len(self.probs) + + def __repr__(self) -> str: + return f"ClassificationOutput(num_classes={self.num_classes})" + + +class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]): + + @staticmethod + def from_base(request_output: PoolingRequestOutput): + return ClassificationRequestOutput( + request_id=request_output.request_id, + outputs=ClassificationOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) + + +@dataclass +class ScoringOutput: + """The output data of one scoring output of a request. + + Args: + score: The similarity score, which is a scalar value. + """ + score: float + + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 0: + raise ValueError("pooled_data should be a scalar score") + + return ScoringOutput(pooled_data.item()) + + def __repr__(self) -> str: + return f"ScoringOutput(score={self.score})" + + @property + @deprecated("`LLM.score()` now returns scalar scores. " + "Please access it via the `score` attribute. 
") + def embedding(self) -> list[float]: + return [self.score] + + +class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]): + + @staticmethod + def from_base(request_output: PoolingRequestOutput): + return ScoringRequestOutput( + request_id=request_output.request_id, + outputs=ScoringOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__init__.py b/.venv/lib/python3.11/site-packages/vllm/platforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e4767a378f45f816836e3bcb0cae74db91cecba7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/__init__.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 + +import logging +import traceback +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.plugins import load_plugins_by_group +from vllm.utils import resolve_obj_by_qualname + +from .interface import _Backend # noqa: F401 +from .interface import CpuArchEnum, Platform, PlatformEnum + +logger = logging.getLogger(__name__) + + +def tpu_platform_plugin() -> Optional[str]: + is_tpu = False + try: + # While it's technically possible to install libtpu on a + # non-TPU machine, this is a very uncommon scenario. Therefore, + # we assume that libtpu is installed if and only if the machine + # has TPUs. + import libtpu # noqa: F401 + is_tpu = True + except Exception: + pass + + return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None + + +def cuda_platform_plugin() -> Optional[str]: + is_cuda = False + + try: + from vllm.utils import import_pynvml + pynvml = import_pynvml() + pynvml.nvmlInit() + try: + if pynvml.nvmlDeviceGetCount() > 0: + is_cuda = True + finally: + pynvml.nvmlShutdown() + except Exception as e: + if "nvml" not in e.__class__.__name__.lower(): + # If the error is not related to NVML, re-raise it. + raise e + + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): + is_cuda = True + + return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None + + +def rocm_platform_plugin() -> Optional[str]: + is_rocm = False + + try: + import amdsmi + amdsmi.amdsmi_init() + try: + if len(amdsmi.amdsmi_get_processor_handles()) > 0: + is_rocm = True + finally: + amdsmi.amdsmi_shut_down() + except Exception: + pass + + return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None + + +def hpu_platform_plugin() -> Optional[str]: + is_hpu = False + try: + from importlib import util + is_hpu = util.find_spec('habana_frameworks') is not None + except Exception: + pass + + return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None + + +def xpu_platform_plugin() -> Optional[str]: + is_xpu = False + + try: + # installed IPEX if the machine has XPUs. 
+ import intel_extension_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch # noqa: F401 + import torch + if hasattr(torch, 'xpu') and torch.xpu.is_available(): + is_xpu = True + except Exception: + pass + + return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None + + +def cpu_platform_plugin() -> Optional[str]: + is_cpu = False + try: + from importlib.metadata import version + is_cpu = "cpu" in version("vllm") + if not is_cpu: + import platform + is_cpu = platform.machine().lower().startswith("arm") + + except Exception: + pass + + return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None + + +def neuron_platform_plugin() -> Optional[str]: + is_neuron = False + try: + import transformers_neuronx # noqa: F401 + is_neuron = True + except ImportError: + pass + + return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None + + +def openvino_platform_plugin() -> Optional[str]: + is_openvino = False + try: + from importlib.metadata import version + is_openvino = "openvino" in version("vllm") + except Exception: + pass + + return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None + + +builtin_platform_plugins = { + 'tpu': tpu_platform_plugin, + 'cuda': cuda_platform_plugin, + 'rocm': rocm_platform_plugin, + 'hpu': hpu_platform_plugin, + 'xpu': xpu_platform_plugin, + 'cpu': cpu_platform_plugin, + 'neuron': neuron_platform_plugin, + 'openvino': openvino_platform_plugin, +} + + +def resolve_current_platform_cls_qualname() -> str: + platform_plugins = load_plugins_by_group('vllm.platform_plugins') + + activated_plugins = [] + + for name, func in chain(builtin_platform_plugins.items(), + platform_plugins.items()): + try: + assert callable(func) + platform_cls_qualname = func() + if platform_cls_qualname is not None: + activated_plugins.append(name) + except Exception: + pass + + activated_builtin_plugins = list( + set(activated_plugins) & set(builtin_platform_plugins.keys())) + activated_oot_plugins = list( + set(activated_plugins) & set(platform_plugins.keys())) + + if len(activated_oot_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_oot_plugins}") + elif len(activated_oot_plugins) == 1: + platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]() + logger.info("Platform plugin %s is activated", + activated_oot_plugins[0]) + elif len(activated_builtin_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_builtin_plugins}") + elif len(activated_builtin_plugins) == 1: + platform_cls_qualname = builtin_platform_plugins[ + activated_builtin_plugins[0]]() + logger.info("Automatically detected platform %s.", + activated_builtin_plugins[0]) + else: + platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform" + logger.info( + "No platform detected, vLLM is running on UnspecifiedPlatform") + return platform_cls_qualname + + +_current_platform = None +_init_trace: str = '' + +if TYPE_CHECKING: + current_platform: Platform + + +def __getattr__(name: str): + if name == 'current_platform': + # lazy init current_platform. + # 1. out-of-tree platform plugins need `from vllm.platforms import + # Platform` so that they can inherit `Platform` class. Therefore, + # we cannot resolve `current_platform` during the import of + # `vllm.platforms`. + # 2. 
when users use out-of-tree platform plugins, they might run + # `import vllm`, some vllm internal code might access + # `current_platform` during the import, and we need to make sure + # `current_platform` is only resolved after the plugins are loaded + # (we have tests for this, if any developer violate this, they will + # see the test failures). + global _current_platform + if _current_platform is None: + platform_cls_qualname = resolve_current_platform_cls_qualname() + _current_platform = resolve_obj_by_qualname( + platform_cls_qualname)() + global _init_trace + _init_trace = "".join(traceback.format_stack()) + return _current_platform + elif name in globals(): + return globals()[name] + else: + raise AttributeError( + f"No attribute named '{name}' exists in {__name__}.") + + +__all__ = [ + 'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum', + "_init_trace" +] diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..625e78a8f51cebd47ac5052abd6a8b222e24e850 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c48c741e507990bf32d7a2c1585f81f28cde08c0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cuda.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cuda.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa707b3dee783986e0a16c0f8a7148acf14b6128 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/cuda.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/hpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/hpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ef7c03dfc141d0bf07acf243a64b13f23a9c19e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/hpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/interface.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/interface.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b47558c4748d824a73c881590794aadd7e156de1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/interface.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/neuron.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/neuron.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18b9870cb15e826264ab681daa4071e12598d2d8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/neuron.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/openvino.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/openvino.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c6197537b933acabd82715c11be4a3701832715 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/openvino.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/rocm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/rocm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aa01eb945d4efd6d32040fd96ac7b3a761d7086 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/rocm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/tpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/tpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9853550af3bd18998097fa01a4a3cd77ca0f8c71 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/tpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/xpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/xpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76f95a2662f54ebc322c132726dc8154059a201a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/platforms/__pycache__/xpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/cpu.py b/.venv/lib/python3.11/site-packages/vllm/platforms/cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..4e0683b8a2de1bc44c34b452bde5ebb5e310ea8a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/cpu.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import TYPE_CHECKING, Optional + +import psutil +import torch + +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum, _Backend + +logger = init_logger(__name__) + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +class CpuPlatform(Platform): + _enum = PlatformEnum.CPU + device_name: str = "cpu" + device_type: str = "cpu" + dispatch_key: str = "CPU" + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return "cpu" + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: + if selected_backend != _Backend.TORCH_SDPA: + logger.info("Cannot use %s backend on CPU.", selected_backend) + logger.info("Using Torch SDPA backend.") + return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + return psutil.virtual_memory().total + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + + @classmethod + def inference_mode(cls): + return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + import vllm.envs as envs + from vllm.utils import GiB_bytes + model_config = vllm_config.model_config + # Reminder: Please update docs/source/features/compatibility_matrix.md + # If the feature combo become valid + if not model_config.enforce_eager: + 
logger.warning(
+                "CUDA graph is not supported on CPU; falling back to "
+                "eager mode.")
+            model_config.enforce_eager = True
+
+        cache_config = vllm_config.cache_config
+
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expected a positive integer value.")
+
+        scheduler_config = vllm_config.scheduler_config
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and model_config.dtype == torch.half):
+            logger.warning("Chunked-prefill on the CPU backend does not "
+                           "support fp16 for now; casting to bf16.")
+            model_config.dtype = torch.bfloat16
+
+        parallel_config = vllm_config.parallel_config
+        if (parallel_config.distributed_executor_backend is not None
+                and parallel_config.distributed_executor_backend != "mp"):
+            logger.warning(("%s is not supported on CPU; falling back to the "
+                            "mp distributed executor backend."),
+                           parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "mp"
+        if parallel_config.worker_cls == "auto":
+            if vllm_config.speculative_config:
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                parallel_config.sd_worker_cls = \
+                    "vllm.worker.cpu_worker.CPUWorker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+        assert vllm_config.device_config.device_type == "cpu"
+
+        #
+        # Environment variables for CPU executor
+        #
+
+        # Disable torch async compiling, which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
+        # Intel OpenMP settings
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload_str:
+            # The time (in milliseconds) that a thread should wait after
+            # completing the execution of a parallel region, before sleeping.
+            os.environ['KMP_BLOCKTIME'] = "1"
+            # Prevents the CPU from dropping into a low-performance state
+            os.environ['KMP_TPAUSE'] = "0"
+            # Provides fine-grained parallelism
+            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+        # Hint IPEX to use shared-memory-based AllReduce
+        os.environ["LOCAL_WORLD_SIZE"] = str(
+            vllm_config.parallel_config.tensor_parallel_size)
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/cuda.py b/.venv/lib/python3.11/site-packages/vllm/platforms/cuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..991d55ac861a470807dd24fcb87507373226f6fa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/vllm/platforms/cuda.py
@@ -0,0 +1,390 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Code inside this file can safely assume cuda platform, e.g. importing
+pynvml. However, it should not initialize cuda context.
+""" + +import os +from functools import lru_cache, wraps +from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar, + Union) + +import torch +from typing_extensions import ParamSpec + +# import custom ops, trigger op registration +import vllm._C # noqa +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.utils import import_pynvml + +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +pynvml = import_pynvml() + +# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models +# see https://github.com/huggingface/diffusers/issues/9704 for details +torch.backends.cuda.enable_cudnn_sdp(False) + + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + msg = ( + "CUDA_VISIBLE_DEVICES is set to empty string, which means" + " GPU support is disabled. If you are using ray, please unset" + " the environment variable `CUDA_VISIBLE_DEVICES` inside the" + " worker/actor. " + "Check https://github.com/vllm-project/vllm/issues/8402 for" + " more information.") + raise RuntimeError(msg) + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id + + +def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: + + @wraps(fn) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: + pynvml.nvmlInit() + try: + return fn(*args, **kwargs) + finally: + pynvml.nvmlShutdown() + + return wrapper + + +class CudaPlatformBase(Platform): + _enum = PlatformEnum.CUDA + device_name: str = "cuda" + device_type: str = "cuda" + dispatch_key: str = "CUDA" + ray_device_key: str = "GPU" + device_control_env_var: str = "CUDA_VISIBLE_DEVICES" + + @classmethod + def get_device_capability(cls, + device_id: int = 0 + ) -> Optional[DeviceCapability]: + raise NotImplementedError + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + raise NotImplementedError + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + raise NotImplementedError + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + if enforce_eager: + logger.warning( + "To see benefits of async output processing, enable CUDA " + "graph. Since, enforce-eager is enabled, async output " + "processor cannot be used") + return False + return True + + @classmethod + def is_full_nvlink(cls, device_ids: List[int]) -> bool: + raise NotImplementedError + + @classmethod + def log_warnings(cls): + pass + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + if envs.VLLM_USE_V1: + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on VLLM V1. Please launch without " + "--num-scheduler-steps.") + else: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + if envs.VLLM_USE_V1: + raise NotImplementedError( + "Speculative decoding is not yet supported on VLLM V1." 
+ ) + else: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.worker.Worker" + else: + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.gpu_worker.Worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" + + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.cuda.reset_peak_memory_stats(device) + return torch.cuda.max_memory_allocated(device) + + @classmethod + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1, + use_mla) -> str: + if use_v1: + logger.info("Using Flash Attention backend on V1 engine.") + return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + if use_mla: + logger.info("Using Triton MLA backend.") + return "vllm.attention.backends.triton_mla.TritonMLABackend" + if selected_backend == _Backend.FLASHINFER: + logger.info("Using FlashInfer backend.") + return "vllm.attention.backends.flashinfer.FlashInferBackend" + elif selected_backend == _Backend.XFORMERS: + logger.info("Using XFormers backend.") + return "vllm.attention.backends.xformers.XFormersBackend" + elif selected_backend == _Backend.FLASH_ATTN: + pass + elif selected_backend: + raise ValueError( + f"Invalid attention backend for {cls.device_name}, " + f"with use_v1: {use_v1} use_mla: {use_mla}") + + target_backend = _Backend.FLASH_ATTN + if not cls.has_device_capability(80): + # Volta and Turing NVIDIA GPUs. + logger.info( + "Cannot use FlashAttention-2 backend for Volta and Turing " + "GPUs.") + target_backend = _Backend.XFORMERS + elif dtype not in (torch.float16, torch.bfloat16): + logger.info( + "Cannot use FlashAttention-2 backend for dtype other than " + "torch.float16 or torch.bfloat16.") + target_backend = _Backend.XFORMERS + elif kv_cache_dtype is not None and \ + kv_cache_dtype.startswith("fp8"): + logger.info( + "Cannot use FlashAttention-2 backend for FP8 KV cache.") + logger.warning( + "Please use FlashInfer backend with FP8 KV Cache for " + "better performance by setting environment variable " + "VLLM_ATTENTION_BACKEND=FLASHINFER") + target_backend = _Backend.XFORMERS + elif block_size % 16 != 0: + logger.info( + "Cannot use FlashAttention-2 backend for block size not " + "divisible by 16.") + target_backend = _Backend.XFORMERS + + # FlashAttn is valid for the model, checking if the package is + # installed. + if target_backend == _Backend.FLASH_ATTN: + try: + import vllm.vllm_flash_attn # noqa: F401 + from vllm.attention.backends.flash_attn import ( # noqa: F401 + FlashAttentionBackend) + + supported_sizes = \ + FlashAttentionBackend.get_supported_head_sizes() + if head_size not in supported_sizes: + logger.info( + "Cannot use FlashAttention-2 backend for head size %d.", + head_size) + target_backend = _Backend.XFORMERS + except ImportError: + logger.info( + "Cannot use FlashAttention-2 backend because the " + "vllm.vllm_flash_attn package is not found. 
" + "Make sure that vllm_flash_attn was built and installed " + "(on by default).") + target_backend = _Backend.XFORMERS + + if target_backend == _Backend.XFORMERS: + logger.info("Using XFormers backend.") + return "vllm.attention.backends.xformers.XFormersBackend" + + logger.info("Using Flash Attention backend.") + return "vllm.attention.backends.flash_attn.FlashAttentionBackend" + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU" + + +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA +class NvmlCudaPlatform(CudaPlatformBase): + + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_capability(cls, + device_id: int = 0 + ) -> Optional[DeviceCapability]: + try: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + return DeviceCapability(major=major, minor=minor) + except RuntimeError: + return None + + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def has_device_capability( + cls, + capability: Union[Tuple[int, int], int], + device_id: int = 0, + ) -> bool: + try: + return super().has_device_capability(capability, device_id) + except RuntimeError: + return False + + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_name(cls, device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + return cls._get_physical_device_name(physical_device_id) + + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_uuid(cls, device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return pynvml.nvmlDeviceGetUUID(handle) + + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_total_memory(cls, device_id: int = 0) -> int: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) + + @classmethod + @with_nvml_context + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + """ + query if the set of gpus are fully connected by nvlink (1 hop) + """ + handles = [ + pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids + ] + for i, handle in enumerate(handles): + for j, peer_handle in enumerate(handles): + if i < j: + try: + p2p_status = pynvml.nvmlDeviceGetP2PStatus( + handle, + peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK, + ) + if p2p_status != pynvml.NVML_P2P_STATUS_OK: + return False + except pynvml.NVMLError: + logger.exception( + "NVLink detection failed. 
This is normal if" + " your machine has no NVLink equipped.") + return False + return True + + @classmethod + def _get_physical_device_name(cls, device_id: int = 0) -> str: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetName(handle) + + @classmethod + @with_nvml_context + def log_warnings(cls): + device_ids: int = pynvml.nvmlDeviceGetCount() + if device_ids > 1: + device_names = [ + cls._get_physical_device_name(i) for i in range(device_ids) + ] + if (len(set(device_names)) > 1 + and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"): + logger.warning( + "Detected different devices in the system: \n%s\nPlease" + " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " + "avoid unexpected behavior.", + "\n".join(device_names), + ) + + +class NonNvmlCudaPlatform(CudaPlatformBase): + + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + major, minor = torch.cuda.get_device_capability(device_id) + return DeviceCapability(major=major, minor=minor) + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + logger.exception( + "NVLink detection not possible, as context support was" + " not found. Assuming no NVLink available.") + return False + + +# Autodetect either NVML-enabled or non-NVML platform +# based on whether NVML is available. +nvml_available = False +try: + try: + pynvml.nvmlInit() + nvml_available = True + except Exception: + # On Jetson, NVML is not supported. 
+ nvml_available = False +finally: + if nvml_available: + pynvml.nvmlShutdown() + +CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform + +try: + from sphinx.ext.autodoc.mock import _MockModule + + if not isinstance(pynvml, _MockModule): + CudaPlatform.log_warnings() +except ModuleNotFoundError: + CudaPlatform.log_warnings() diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/hpu.py b/.venv/lib/python3.11/site-packages/vllm/platforms/hpu.py new file mode 100644 index 0000000000000000000000000000000000000000..78ddb67bb3fa31d9c7334bc61eea32a100765258 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/hpu.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm import envs +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +class HpuPlatform(Platform): + _enum = PlatformEnum.HPU + device_name: str = "hpu" + device_type: str = "hpu" + dispatch_key: str = "HPU" + ray_device_key: str = "HPU" + device_control_env_var: str = "HABANA_VISIBLE_MODULES" + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: + logger.info("Using HPUAttention backend.") + return "vllm.attention.backends.hpu_attn.HPUAttentionBackend" + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + + @staticmethod + def inference_mode(): + return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + + scheduler_config = vllm_config.scheduler_config + if scheduler_config.is_multi_step: + raise NotImplementedError( + "Multi-step execution is not implemented for HPU") + + if vllm_config.speculative_config is not None: + raise NotImplementedError( + "Speculative decoding is not implemented for HPU") + + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" + + # NOTE(kzawora): default block size for Gaudi should be 128 + # smaller sizes still work, but very inefficiently + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 128 + if (parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", + None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning( + "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. 
" + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + @classmethod + def is_pin_memory_available(cls): + logger.warning("Pin memory is not supported on HPU.") + return False + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU" diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/interface.py b/.venv/lib/python3.11/site-packages/vllm/platforms/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..211e288b125da39c3a9aa9bb155c0622885289d7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/interface.py @@ -0,0 +1,308 @@ +# SPDX-License-Identifier: Apache-2.0 + +import enum +import platform +import random +from platform import uname +from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union + +import numpy as np +import torch + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +def in_wsl() -> bool: + # Reference: https://github.com/microsoft/WSL/issues/4071 + return "microsoft" in " ".join(uname()).lower() + + +class _Backend(enum.Enum): + FLASH_ATTN = enum.auto() + FLASH_ATTN_VLLM_V1 = enum.auto() + XFORMERS = enum.auto() + ROCM_FLASH = enum.auto() + TORCH_SDPA = enum.auto() + OPENVINO = enum.auto() + FLASHINFER = enum.auto() + TRITON_MLA = enum.auto() + HPU_ATTN = enum.auto() + PALLAS = enum.auto() + IPEX = enum.auto() + BLOCK_SPARSE_FLASH_ATTN = enum.auto() + NO_ATTENTION = enum.auto() + + +class PlatformEnum(enum.Enum): + CUDA = enum.auto() + ROCM = enum.auto() + TPU = enum.auto() + HPU = enum.auto() + XPU = enum.auto() + CPU = enum.auto() + NEURON = enum.auto() + OPENVINO = enum.auto() + OOT = enum.auto() + UNSPECIFIED = enum.auto() + + +class CpuArchEnum(enum.Enum): + X86 = enum.auto() + ARM = enum.auto() + POWERPC = enum.auto() + OTHER = enum.auto() + UNKNOWN = enum.auto() + + +class DeviceCapability(NamedTuple): + major: int + minor: int + + def as_version_str(self) -> str: + return f"{self.major}.{self.minor}" + + def to_int(self) -> int: + """ + Express device capability as an integer ````. + + It is assumed that the minor version is always a single digit. + """ + assert 0 <= self.minor < 10 + return self.major * 10 + self.minor + + +class Platform: + _enum: PlatformEnum + device_name: str + device_type: str + + # available dispatch keys: + # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa + # use "CPU" as a fallback for platforms not registered in PyTorch + dispatch_key: str = "CPU" + + # available ray device keys: + # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa + # empty string means the device does not support ray + ray_device_key: str = "" + + # platform-agnostic way to specify the device control environment variable, + # .e.g. CUDA_VISIBLE_DEVICES for CUDA. + # hint: search for "get_visible_accelerator_ids_env_var" in + # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa + device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER" + + # The torch.compile backend for compiling simple and + # standalone functions. The default value is "inductor" to keep + # the same behavior as PyTorch. 
+    # NOTE: for the forward part of the model, vLLM has another separate
+    # compilation strategy.
+    simple_compile_backend: str = "inductor"
+
+    supported_quantization: list[str] = []
+
+    def is_cuda(self) -> bool:
+        return self._enum == PlatformEnum.CUDA
+
+    def is_rocm(self) -> bool:
+        return self._enum == PlatformEnum.ROCM
+
+    def is_tpu(self) -> bool:
+        return self._enum == PlatformEnum.TPU
+
+    def is_hpu(self) -> bool:
+        return self._enum == PlatformEnum.HPU
+
+    def is_xpu(self) -> bool:
+        return self._enum == PlatformEnum.XPU
+
+    def is_cpu(self) -> bool:
+        return self._enum == PlatformEnum.CPU
+
+    def is_neuron(self) -> bool:
+        return self._enum == PlatformEnum.NEURON
+
+    def is_openvino(self) -> bool:
+        return self._enum == PlatformEnum.OPENVINO
+
+    def is_out_of_tree(self) -> bool:
+        return self._enum == PlatformEnum.OOT
+
+    def is_cuda_alike(self) -> bool:
+        """Stateless version of :func:`torch.cuda.is_available`."""
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        """Get the attention backend class of a device."""
+        return ""
+
+    @classmethod
+    def get_device_capability(
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
+        """Stateless version of :func:`torch.cuda.get_device_capability`."""
+        return None
+
+    @classmethod
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Test whether this platform is compatible with a device capability.
+
+        The ``capability`` argument can either be:
+
+        - A tuple ``(major, minor)``.
+        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
+        """
+        current_capability = cls.get_device_capability(device_id=device_id)
+        if current_capability is None:
+            return False
+
+        if isinstance(capability, tuple):
+            return current_capability >= capability
+
+        return current_capability.to_int() >= capability
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Get the name of a device."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Get the uuid of a device, e.g. the PCI bus ID."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Get the total memory of a device in bytes."""
+        raise NotImplementedError
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """
+        Check if the current platform supports async output.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def inference_mode(cls):
+        """A device-specific wrapper of `torch.inference_mode`.
+
+        This wrapper is recommended because some hardware backends such as TPU
+        do not support `torch.inference_mode`. In such a case, they will fall
+        back to `torch.no_grad` by overriding this method.
+        """
+        return torch.inference_mode(mode=True)
+
+    @classmethod
+    def seed_everything(cls, seed: int) -> None:
+        """
+        Set the seed of each random module.
+        `torch.manual_seed` will set seed on all devices.
+ + Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20 + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + """ + Check and update the configuration for the current platform. + + It can raise an exception if the configuration is not compatible with + the current platform, or it can update the configuration to make it + compatible with the current platform. + + The config is passed by reference, so it can be modified in place. + """ + pass + + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + """ + Verify whether the current platform supports the specified model + architecture. + + - This will raise an Error or Warning based on the model support on + the current platform. + - By default all models are considered supported. + """ + pass + + @classmethod + def verify_quantization(cls, quant: str) -> None: + """ + Verify whether the quantization is supported by the current platform. + """ + if cls.supported_quantization and \ + quant not in cls.supported_quantization: + raise ValueError( + f"{quant} quantization is currently not supported in " + f"{cls.device_name}.") + + @classmethod + def get_cpu_architecture(cls) -> CpuArchEnum: + """ + Determine the CPU architecture of the current system. + Returns CpuArchEnum indicating the architecture type. + """ + machine = platform.machine().lower() + + if machine in ("x86_64", "amd64", "i386", "i686"): + return CpuArchEnum.X86 + elif machine.startswith("arm") or machine.startswith("aarch"): + return CpuArchEnum.ARM + elif machine.startswith("ppc"): + return CpuArchEnum.POWERPC + + return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN + + @classmethod + def is_pin_memory_available(cls) -> bool: + """Checks whether pin memory is available on the current platform.""" + if in_wsl(): + # Pinning memory in WSL is not supported. + # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications + logger.warning("Using 'pin_memory=False' as WSL is detected. " + "This may slow down the performance.") + return False + return True + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + """ + Return the memory usage in bytes. + """ + raise NotImplementedError + + @classmethod + def get_punica_wrapper(cls) -> str: + """ + Return the punica wrapper for current platform. 
+ """ + raise NotImplementedError + + +class UnspecifiedPlatform(Platform): + _enum = PlatformEnum.UNSPECIFIED + device_type = "" diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/neuron.py b/.venv/lib/python3.11/site-packages/vllm/platforms/neuron.py new file mode 100644 index 0000000000000000000000000000000000000000..5a03f5f7acbc1330453bacc46aab526eabbb5767 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/neuron.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, Optional + +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +class NeuronPlatform(Platform): + _enum = PlatformEnum.NEURON + device_name: str = "neuron" + device_type: str = "neuron" + ray_device_key: str = "neuron_cores" + supported_quantization: list[str] = ["neuron_quant"] + device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return "neuron" + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = \ + "vllm.worker.neuron_worker.NeuronWorker" + + if parallel_config.world_size > 1: + parallel_config.distributed_executor_backend = "uni" + + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." + assert (not vllm_config.speculative_config + ), "Speculative decoding not yet supported for Neuron backend." 
+ + cache_config = vllm_config.cache_config + if cache_config: + # neuron needs block_size = max_model_len + vllm_config.cache_config.block_size = \ + vllm_config.model_config.max_model_len + + @classmethod + def is_pin_memory_available(cls) -> bool: + logger.warning("Pin memory is not supported on Neuron.") + return False diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/openvino.py b/.venv/lib/python3.11/site-packages/vllm/platforms/openvino.py new file mode 100644 index 0000000000000000000000000000000000000000..41221de0afe509556cf8a01b84dcbcb14c137e80 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/openvino.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, Optional + +import torch + +import vllm.envs as envs +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + +try: + import openvino as ov + import openvino.properties.hint as hints +except ImportError as e: + logger.warning("Failed to import OpenVINO with %r", e) + + +class OpenVinoPlatform(Platform): + _enum = PlatformEnum.OPENVINO + device_name: str = "openvino" + device_type: str = "openvino" + dispatch_key: str = "CPU" + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: + if selected_backend != _Backend.OPENVINO: + logger.info("Cannot use %s backend on OpenVINO.", selected_backend) + logger.info("Using OpenVINO Attention backend.") + return "vllm.attention.backends.openvino.OpenVINOAttentionBackend" + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return "openvino" + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + + @classmethod + def inference_mode(cls): + return torch.inference_mode(mode=True) + + @classmethod + def is_openvino_cpu(cls) -> bool: + return "CPU" in envs.VLLM_OPENVINO_DEVICE + + @classmethod + def is_openvino_gpu(cls) -> bool: + return "GPU" in envs.VLLM_OPENVINO_DEVICE + + @classmethod + def is_pin_memory_available(cls) -> bool: + logger.warning("Pin memory is not supported on OpenViNO.") + return False + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + from vllm.utils import GiB_bytes + + parallel_config = vllm_config.parallel_config + assert (parallel_config.world_size == 1 + ), "OpenVINO only supports single CPU socket currently." + + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = \ + "vllm.worker.openvino_worker.OpenVINOWorker" + + # check and update model config + model_config = vllm_config.model_config + if model_config.dtype != torch.float32: + logger.warning( + f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}." 
# noqa: G004, E501 + ) + model_config.dtype = torch.float32 + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on OpenVINO backend, fallback to " + "the eager mode.") + model_config.enforce_eager = True + + # check and update cache config + ov_core = ov.Core() + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": + if not OpenVinoPlatform.is_openvino_cpu(): + logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" + "ignored for GPU, f16 data type will be used.") + cache_config.cache_dtype = ov.Type.f16 + else: + logger.info("KV cache type is overridden to u8 via " + "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") + cache_config.cache_dtype = ov.Type.u8 + else: + if OpenVinoPlatform.is_openvino_cpu(): + ov_device = envs.VLLM_OPENVINO_DEVICE + inference_precision = ov_core.get_property( + ov_device, hints.inference_precision) + if inference_precision == ov.Type.bf16: + cache_config.cache_dtype = ov.Type.bf16 + else: + cache_config.cache_dtype = ov.Type.f16 + else: + cache_config.cache_dtype = ov.Type.f16 + + if OpenVinoPlatform.is_openvino_cpu(): + if cache_config.block_size != 32: + logger.info( + f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}" # noqa: G004, E501 + ) + cache_config.block_size = 32 + else: + if cache_config.block_size != 16: + logger.info( + f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}" # noqa: G004, E501 + ) + cache_config.block_size = 16 + + kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE + if kv_cache_space >= 0: + if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu(): + cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore + logger.warning( + "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " + "for OpenVINO backend is not set, using 4 by default.") + else: + cache_config.openvino_kvcache_space_bytes = ( # type: ignore + kv_cache_space * GiB_bytes) + else: + raise RuntimeError( + "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" + f" {kv_cache_space}, expect a positive integer value.") + + assert vllm_config.device_config.device_type == "openvino" + assert vllm_config.lora_config is None, \ + "OpenVINO backend doesn't support LoRA" + assert cls.is_openvino_cpu() or \ + cls.is_openvino_gpu(), \ + "OpenVINO backend supports only CPU and GPU devices" diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/rocm.py b/.venv/lib/python3.11/site-packages/vllm/platforms/rocm.py new file mode 100644 index 0000000000000000000000000000000000000000..035766289aebd56a8cbc02416638cf6771b119d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/rocm.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from functools import lru_cache +from typing import TYPE_CHECKING, Dict, List, Optional + +import torch + +import vllm.envs as envs +from vllm.logger import init_logger + +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + +try: + import vllm._C # noqa: F401 +except ImportError as e: + logger.warning("Failed to import from vllm._C with %r", e) + +# import custom ops, trigger op registration +try: + import vllm._rocm_C # noqa: F401 +except ImportError as e: + logger.warning("Failed to import from 
vllm._rocm_C with %r", e) + +if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: + logger.warning("`fork` method is not supported by ROCm. " + "VLLM_WORKER_MULTIPROC_METHOD is overridden to" + " `spawn` instead.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +# Models not supported by ROCm. +_ROCM_UNSUPPORTED_MODELS: List[str] = [] + +# Models partially supported by ROCm. +# Architecture -> Reason. +_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " + "Triton flash attention. For half-precision SWA support, " + "please use CK flash attention by setting " + "`VLLM_USE_TRITON_FLASH_ATTN=0`") +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { + "Qwen2ForCausalLM": + _ROCM_SWA_REASON, + "MistralForCausalLM": + _ROCM_SWA_REASON, + "MixtralForCausalLM": + _ROCM_SWA_REASON, + "PaliGemmaForConditionalGeneration": + ("ROCm flash attention does not yet " + "fully support 32-bit precision on PaliGemma"), + "Phi3VForCausalLM": + ("ROCm Triton flash attention may run into compilation errors due to " + "excessive use of shared memory. If this happens, disable Triton FA " + "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") +} + + +class RocmPlatform(Platform): + _enum = PlatformEnum.ROCM + device_name: str = "rocm" + device_type: str = "cuda" + dispatch_key: str = "CUDA" + ray_device_key: str = "GPU" + # rocm shares the same device control env var as CUDA + device_control_env_var: str = "CUDA_VISIBLE_DEVICES" + + supported_quantization: list[str] = [ + "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", + "fbgemm_fp8", "gguf", "quark" + ] + + @classmethod + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1, + use_mla) -> str: + if use_mla: + logger.info("Using Triton MLA backend.") + return "vllm.attention.backends.triton_mla.TritonMLABackend" + selected_backend = (_Backend.ROCM_FLASH if selected_backend + == _Backend.FLASH_ATTN else selected_backend) + if selected_backend == _Backend.ROCM_FLASH: + if not cls.has_device_capability(90): + # not Instinct series GPUs. + logger.info("flash_attn is not supported on NAVI GPUs.") + else: + logger.info("%s is not supported in AMD GPUs.", selected_backend) + logger.info("Using ROCmFlashAttention backend.") + return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501 + + @classmethod + @lru_cache(maxsize=8) + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + major, minor = torch.cuda.get_device_capability(device_id) + return DeviceCapability(major=major, minor=minor) + + @classmethod + @lru_cache(maxsize=8) + def get_device_name(cls, device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + if enforce_eager: + logger.warning( + "To see benefits of async output processing, enable CUDA " + "graph. 
Since, enforce-eager is enabled, async output " + "processor cannot be used") + return False + return True + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.worker.Worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" + + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + + @classmethod + def verify_quantization(cls, quant: str) -> None: + super().verify_quantization(quant) + if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ: + logger.warning( + "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" + " is not set, enabling VLLM_USE_TRITON_AWQ.") + envs.VLLM_USE_TRITON_AWQ = True + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU" + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.cuda.reset_peak_memory_stats(device) + return torch.cuda.max_memory_allocated(device) diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/tpu.py b/.venv/lib/python3.11/site-packages/vllm/platforms/tpu.py new file mode 100644 index 0000000000000000000000000000000000000000..fffc61bbaaca0365b8f2a6eddfa7a834f4176903 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/tpu.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +class TpuPlatform(Platform): + _enum = PlatformEnum.TPU + device_name: str = "tpu" + device_type: str = "tpu" + dispatch_key: str = "XLA" + ray_device_key: str = "TPU" + device_control_env_var: str = "TPU_VISIBLE_CHIPS" + + supported_quantization: list[str] = [ + "tpu_int8", "compressed-tensors", "compressed_tensors" + ] + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: + if selected_backend != _Backend.PALLAS: + logger.info("Cannot use %s backend on TPU.", selected_backend) + logger.info("Using Pallas backend.") + return "vllm.attention.backends.pallas.PallasAttentionBackend" + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + raise NotImplementedError + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + raise NotImplementedError + + @classmethod + def 
is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + + @classmethod + def inference_mode(cls): + return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + from vllm.config import CompilationLevel + + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + compilation_config = vllm_config.compilation_config + if compilation_config.level == CompilationLevel.NO_COMPILATION: + # TPU does not support NO_COMPILATION + compilation_config.level = CompilationLevel.DYNAMO_ONCE + assert compilation_config.level < CompilationLevel.PIECEWISE,\ + "TPU does not support Inductor." + + if compilation_config.backend == "": + compilation_config.backend = "openxla" + + assert vllm_config.speculative_config is None, \ + "TPU does not support speculative decoding" + + assert not vllm_config.scheduler_config.chunked_prefill_enabled, ( + "Chunked prefill is not yet supported for TPU backend") + assert not vllm_config.speculative_config, ( + "Speculative decoding is not yet supported for TPU backend") + if vllm_config.model_config.dtype in (torch.float16, torch.float32): + logger.warning( + "The TPU backend currently does not support %s. " + "Using bfloat16 instead.", vllm_config.model_config.dtype) + vllm_config.model_config.dtype = torch.bfloat16 + + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker" + else: + parallel_config.worker_cls = "vllm.worker.tpu_worker.TPUWorker" diff --git a/.venv/lib/python3.11/site-packages/vllm/platforms/xpu.py b/.venv/lib/python3.11/site-packages/vllm/platforms/xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..81bc85f9415e83f8b593c61efa74be307ffe0379 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/platforms/xpu.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.logger import init_logger + +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + + +class XPUPlatform(Platform): + _enum = PlatformEnum.XPU + device_name: str = "xpu" + device_type: str = "xpu" + dispatch_key: str = "XPU" + # Intel XPU's device key is "GPU" for Ray. 
+ # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501 + ray_device_key: str = "GPU" + device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR" + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool, + use_mla: bool) -> str: + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + logger.info("Using IPEX attention backend.") + return "vllm.attention.backends.ipex_attn.IpexAttnBackend" + + @staticmethod + def get_device_capability(device_id: int = 0) -> DeviceCapability: + major, minor, *_ = torch.xpu.get_device_capability( + device_id)['version'].split('.') + return DeviceCapability(major=int(major), minor=int(minor)) + + @staticmethod + def get_device_name(device_id: int = 0) -> str: + return torch.xpu.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.xpu.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + + @staticmethod + def inference_mode(): + return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + # check and update model config + model_config = vllm_config.model_config + if model_config.dtype == torch.bfloat16: + bf16_supported = cls.device_support_bf16() + if not bf16_supported: + logger.warning( + "bfloat16 is only supported on Intel Data Center GPU, " + "Intel Arc GPU is not supported yet. Your device is %s," + "which is not supported. will fallback to float16", + cls.get_device_name()) + model_config.dtype = torch.float16 + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + model_config.enforce_eager = True + + if vllm_config.speculative_config is not None: + raise NotImplementedError( + "XPU does not support speculative decoding") + + if vllm_config.device_config is not None: + assert vllm_config.device_config.device_type == "xpu" + + # check and update parallel config + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + + if parallel_config.distributed_executor_backend is None: + parallel_config.distributed_executor_backend = "ray" + elif parallel_config.distributed_executor_backend == "mp": + # FIXME(kunshang): + # spawn needs calling `if __name__ == '__main__':`` + # fork is not supported for xpu start new process. 
+ logger.error( + "Both start methods (spawn and fork) have issue " + "on XPU if you use mp backend, setting it to ray instead.") + parallel_config.distributed_executor_backend = "ray" + + elif parallel_config.distributed_executor_backend != "ray": + logger.warning( + "%s is not supported on XPU, fallback to ray distributed" + " executor backend.", + parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend = "ray" + + @classmethod + def is_pin_memory_available(cls): + logger.warning("Pin memory is not supported on XPU.") + return False + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.xpu.reset_peak_memory_stats(device) + return torch.xpu.max_memory_allocated(device) + + @classmethod + def device_support_bf16(cls) -> bool: + device_name = cls.get_device_name().lower() + if device_name.count("arc") > 0: + return False + elif device_name.count("data center gpu") > 0: + return True + else: + logger.warning("Unknown device name %s, always use float16", + device_name) + return False diff --git a/.venv/lib/python3.11/site-packages/vllm/pooling_params.py b/.venv/lib/python3.11/site-packages/vllm/pooling_params.py new file mode 100644 index 0000000000000000000000000000000000000000..061232eb11830bcef3acfc68281c467dcd4d478a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/pooling_params.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Optional + +import msgspec + + +class PoolingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] + """API parameters for pooling models. This is currently a placeholder. + + Attributes: + additional_data: Any additional data needed for pooling. 
+ """ + additional_data: Optional[Any] = None + + def clone(self) -> "PoolingParams": + """Returns a deep copy of the PoolingParams instance.""" + return PoolingParams(additional_data=self.additional_data) + + def __repr__(self) -> str: + return (f"PoolingParams(" + f"additional_metadata={self.additional_data})") diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__init__.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75df534e55b7ee5cb976bf735e0daca8858bd271 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/layers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/layers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d1ae9095cd55c5e4e4c2e6ab5a875c7419c8984 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/layers.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/models.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cf7330901e1f5dd94868a3b94e19cdb854b43fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/models.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/request.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/request.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc4fcc070a6fd87bee942b5fe0943ecb6fdf875a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/request.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ece6f5547db73ad4383cf65f45818270b5bc643 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/worker_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/worker_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..125af6713809de3e93d6cd62493e0b8419029f3c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/__pycache__/worker_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/layers.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f9f16919b7fc7bbb89899b615dc06e415ae6bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/layers.py @@ -0,0 +1,82 @@ +# 
SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import nn + +from vllm.adapter_commons.layers import AdapterMapping +from vllm.config import PromptAdapterConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) + + +@dataclass +class PromptAdapterMapping(AdapterMapping): + pass + + +class VocabParallelEmbeddingWithPromptAdapter(nn.Module): + + def __init__(self, base_layer: VocabParallelEmbedding) -> None: + super().__init__() + self.base_layer = base_layer + self.emb_layer = self.base_layer + if 'LoRA' in base_layer.__class__.__name__: + self.emb_layer = self.base_layer.base_layer + + def create_prompt_adapter_weights( + self, prompt_adapter_config: PromptAdapterConfig): + self.embeddings_tensors = torch.zeros( + ( + prompt_adapter_config.max_prompt_adapters, + prompt_adapter_config.max_prompt_adapter_token, + self.emb_layer.embedding_dim, + ), + dtype=self.emb_layer.weight.dtype, + device=self.emb_layer.weight.device, + ) + self.adapter_lengths = torch.zeros( + prompt_adapter_config.max_prompt_adapters, + dtype=torch.long, + device=self.emb_layer.weight.device) + + self.indices_gpu: torch.Tensor + self.embedding_indices_gpu: torch.Tensor + + def reset_prompt_adapter(self, index: int): + self.embeddings_tensors[index] = 0 + + def set_prompt_adapter( + self, + index: int, + adapter_model: Optional[torch.Tensor], + ): + self.reset_prompt_adapter(index) + if adapter_model is not None: + length = adapter_model.shape[0] + self.embeddings_tensors[index, :length] = adapter_model + self.adapter_lengths[index] = length + + def set_mapping( + self, + prompt_indices: torch.Tensor, + prompt_embedding_indices: torch.Tensor, + ): + self.indices_gpu = prompt_indices.to( + device=self.emb_layer.weight.device) + self.embedding_indices_gpu = prompt_embedding_indices.to( + device=self.emb_layer.weight.device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + hidden_states = self.base_layer(x) + if self.embedding_indices_gpu.ndim > 1: + valid_mask = self.indices_gpu != -1 + gathered_embeddings = self.embeddings_tensors[ + self.embedding_indices_gpu[:, 0], + self.embedding_indices_gpu[:, 1]] + + # Update hidden states + hidden_states[valid_mask] = gathered_embeddings + return hidden_states \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/models.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba7d0896f95ac1c395f9ffecec89cb900dbce49 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/models.py @@ -0,0 +1,357 @@ +# SPDX-License-Identifier: Apache-2.0 + +import logging +import math +from typing import Any, Callable, Dict, List, Optional, Type + +import torch +from torch import nn + +from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel, + AdapterModelManager) +from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, + get_adapter, list_adapters, + remove_adapter, set_adapter_mapping) +from vllm.config import PromptAdapterConfig +from vllm.prompt_adapter.layers import ( + VocabParallelEmbeddingWithPromptAdapter) # yapf: disable +from vllm.prompt_adapter.layers import PromptAdapterMapping +from vllm.prompt_adapter.utils import load_peft_weights + +logger = logging.getLogger(__name__) + +_GLOBAL_PROMPT_ADAPTER_ID = 0 + + +def get_prompt_adapter_id(): + global _GLOBAL_PROMPT_ADAPTER_ID + 
_GLOBAL_PROMPT_ADAPTER_ID += 1 + return _GLOBAL_PROMPT_ADAPTER_ID + + +def convert_to_embedding_indices(indices): + embedding_indices = [] + count = 0 + + for value in indices: + if value == -1: + count = 0 + else: + embedding_indices.append([value, count]) + count += 1 + + return torch.tensor(embedding_indices) + + +def convert_mapping( + mapping: PromptAdapterMapping, + prompt_adapter_index_to_id: List[Optional[int]], +) -> torch.Tensor: + """Converts PromptAdapterMapping to index tensors. + + Args: + mapping: PromptAdapterMapping mapping rows in a + batch to PromptAdapter ids. + prompt_adapter_index_to_id: List mapping PromptAdapter + ids to PromptAdapter indices. + + Returns: + pa_indices: Tensor of shape [batch_size] mapping batch rows to + PromptAdapter indices. + """ + id_to_index = { + id_: idx + for idx, id_ in enumerate(prompt_adapter_index_to_id) + if id_ is not None + } + pa_indices = ([ + id_to_index.get(id_, -1) if id_ > 0 else -1 + for id_ in mapping.index_mapping + ]) + + pa_embedding_mapping = convert_to_embedding_indices(pa_indices) + pa_indices = torch.tensor(pa_indices) + return pa_indices, pa_embedding_mapping + + +class PromptAdapterModel(AdapterModel): + + def __init__(self, + prompt_adapter_id=None, + num_virtual_tokens=None, + prompt_embedding=None) -> None: + self.id = prompt_adapter_id + self.prompt_embedding = prompt_embedding + self.num_virtual_tokens = num_virtual_tokens + + @classmethod + def from_local_checkpoint( + cls, + adapter_model_path: str, + prompt_adapter_id: int, + num_virtual_tokens: int, + config: PromptAdapterConfig, + device: str = "cuda", + ) -> "PromptAdapterModel": + + if num_virtual_tokens > config.max_prompt_adapter_token: + raise ValueError( + f'num_virtual_tokens ({num_virtual_tokens}) should be <= ' + f'max_prompt_adapter_token({config.max_prompt_adapter_token})') + + adapters_weights = load_peft_weights(adapter_model_path, device) + prompt_embedding = adapters_weights["prompt_embeddings"].to( + config.prompt_adapter_dtype) + + return cls(prompt_adapter_id, num_virtual_tokens, prompt_embedding) + + +class PromptAdapterModelManager(AdapterModelManager): + """A manager that manages multiple Prompt Adapter models.""" + + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + prompt_adapter_config: PromptAdapterConfig, + ): + """Create a PromptAdapterModel and adapter for a given model. + + Args: + model: the model to be adapted. + max_num_seqs: the maximum number of sequences model can run in a + single batch. + max_num_batched_tokens: the maximum number of tokens model can run + in a single batch. + prompt_adapter_config: the PromptAdapter config, + """ + self.model: nn.Module = model + # Dict instead of a Set for compatibility with LRUCache. 
+ self.prompt_adapter_index_to_id: List[ + Optional[int]] = [None] * self.prompt_adapter_slots + self.max_num_seqs = max_num_seqs + self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 + self.prompt_adapter_config = prompt_adapter_config + self.model.prompt_adapter_manager = self + self.adapter_type = 'PromptAdapter' + + self.base_indices = torch.tensor([-1]) + self.base_embedding_indices = torch.tensor([]) + + self.modules: Dict[str, nn.Module] = {} + self._create_prompt_adapter_modules() + self._last_mapping: Optional[PromptAdapterMapping] = None + + @property + def prompt_adapter_slots(self) -> int: + return self.prompt_adapter_config.max_prompt_adapters + + @property + def adapter_slots(self) -> int: + return self.prompt_adapter_slots + + @property + def capacity(self) -> int: + return self.prompt_adapter_config.max_cpu_prompt_adapters + + def activate_adapter( + self, + prompt_adapter_id: int, + ) -> bool: + """Move PromptAdapter into a GPU buffer + to be used in the forward pass.""" + if prompt_adapter_id in self._active_adapters: + return False + first_free_slot = next( + ((i, prompt_adapter_id) for i, prompt_adapter_id in enumerate( + self.prompt_adapter_index_to_id) if prompt_adapter_id is None), + None) + if first_free_slot is None: + raise ValueError("No free prompt_adapter slots") + index, _ = first_free_slot + self._active_adapters[prompt_adapter_id] = None + prompt_adapter_model = (self._registered_adapters[prompt_adapter_id]) + logger.debug("Activating prompt_adapter. int id: %d, slot index: %d", + prompt_adapter_model.id, index) + self.prompt_adapter_index_to_id[index] = prompt_adapter_model.id + for _, v in self.modules.items(): + v.set_prompt_adapter(index, prompt_adapter_model.prompt_embedding) + return True + + def _deactivate_adapter(self, prompt_adapter_id: int): + try: + index = self.prompt_adapter_index_to_id.index(prompt_adapter_id) + self.prompt_adapter_index_to_id[index] = None + for _, v in self.modules.items(): + v.reset_prompt_adapter(index) + except ValueError: + pass + + def _add_adapter(self, prompt_adapter: PromptAdapterModel): + self._registered_adapters[prompt_adapter.id] = prompt_adapter + + def _set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None: + base_indices, base_embedding_indices = convert_mapping( + mapping, self.prompt_adapter_index_to_id) + for k, v in self.modules.items(): + v.set_mapping(base_indices, base_embedding_indices) + + def _create_prompt_adapter_modules(self): + for module_name, module in self.model.named_modules( + remove_duplicate=False): + if "VocabParallel" in module.__class__.__name__: + new_module = VocabParallelEmbeddingWithPromptAdapter(module) + new_module.create_prompt_adapter_weights( + self.prompt_adapter_config) + replaced_module = self.replace_submodule( + self.model, module_name, new_module) + self.register_module(module.__class__.__name__, + replaced_module) + replaced_module.set_mapping(self.base_indices, + self.base_embedding_indices) + break + + def replace_submodule(self, model: nn.Module, module_name: str, + new_module: nn.Module) -> nn.Module: + """Replace a submodule in a model with a new module.""" + parent = model.get_submodule(".".join(module_name.split(".")[:-1])) + target_name = module_name.split(".")[-1] + setattr(parent, target_name, new_module) + return new_module + + def register_module(self, module_name: str, module: nn.Module): + self.modules[module_name] = module + + def pin_adapter(self, prompt_adapter_id: int) -> bool: + """Pin a PromptAdapterModel in the manager 
cache.""" + raise NotImplementedError( + "Pinning is not supported in PromptAdapterModelManager." + "Use LRUCachePromptAdapterModelManager for pinning" + ) # type: ignore + + def remove_all_adapters(self): + """Remove all PromptAdapterModel from the manager.""" + self._registered_adapters.clear() + self.prompt_adapter_index_to_id = [None] * self.prompt_adapter_slots + self._active_adapters.clear() + + def deactivate_adapter(self, adapter_id: int) -> bool: + return deactivate_adapter(adapter_id, self._active_adapters, + self._deactivate_adapter) + + def add_adapter(self, adapter: PromptAdapterModel) -> bool: + return add_adapter(adapter, self._registered_adapters, self.capacity, + self._add_adapter) + + def set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None: + self._last_mapping = set_adapter_mapping(mapping, self._last_mapping, + self._set_adapter_mapping) + + def remove_adapter(self, adapter_id: int) -> bool: + return remove_adapter(adapter_id, self._registered_adapters, + self.deactivate_adapter) + + def list_adapters(self) -> Dict[int, Any]: + return list_adapters(self._registered_adapters) + + def get_adapter(self, adapter_id: int) -> Optional[Any]: + return get_adapter(adapter_id, self._registered_adapters) + + +class PromptAdapterLRUCache(AdapterLRUCache[PromptAdapterModel]): + + def __init__(self, capacity: int, + deactivate_prompt_adapter_fn: Callable[[int], bool]): + super().__init__(capacity, deactivate_prompt_adapter_fn) + + +class LRUCachePromptAdapterModelManager(PromptAdapterModelManager): + """A model manager that manages multiple prompt_adapters with LRU cache.""" + + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + prompt_adapter_config: PromptAdapterConfig, + ): + self.prompt_adapter_config = prompt_adapter_config + super().__init__(model, max_num_seqs, max_num_batched_tokens, + prompt_adapter_config) + self._registered_adapters = PromptAdapterLRUCache( + self.capacity, self.deactivate_adapter) + self._active_adapters = PromptAdapterLRUCache( + self.prompt_adapter_slots, self._deactivate_adapter) + + def list_adapters(self) -> Dict[int, PromptAdapterModel]: + """List all registered PromptAdapterModel.""" + return dict(self._registered_adapters.cache) + + def add_adapter(self, prompt_adapter: PromptAdapterModel) -> bool: + """Add a PromptAdapterModel to the manager.""" + if prompt_adapter.id not in self._registered_adapters: + self._add_adapter(prompt_adapter) + was_added = True + else: + # We always touch to update the LRU cache order + self._registered_adapters.touch(prompt_adapter.id) + was_added = False + return was_added + + def activate_adapter( + self, + prompt_adapter_id: int, + ) -> bool: + if prompt_adapter_id not in self._active_adapters and len( + self._active_adapters) >= self.prompt_adapter_slots: + self._active_adapters.remove_oldest() + result = super().activate_adapter(prompt_adapter_id) + # We always touch to update the LRU cache order + self._active_adapters.touch(prompt_adapter_id) + return result + + def remove_oldest_adapter(self) -> bool: + if len(self._registered_adapters) > 0: + self._registered_adapters.remove_oldest() + return True + return False + + def pin_adapter(self, prompt_adapter_id: int) -> bool: + """Pin a PromptAdapterModel in the manager cache.""" + self._pin_prompt_adapter_in_cpu_cache(prompt_adapter_id) + self._pin_prompt_adapter_in_gpu_cache(prompt_adapter_id) + return True + + def _pin_prompt_adapter_in_cpu_cache(self, prompt_adapter_id: int): + try: + 
self._registered_adapters.pin(prompt_adapter_id) + except ValueError as err: + raise ValueError( + "Pinning failed. " + f"Prompt Adapter {prompt_adapter_id} is not registered." + ) from err + + def _pin_prompt_adapter_in_gpu_cache(self, prompt_adapter_id: int): + if prompt_adapter_id not in self._active_adapters: + # move adapter to gpu if not already active + self.activate_adapter(prompt_adapter_id) + self._active_adapters.pin(prompt_adapter_id) + + +def create_prompt_adapter_manager( + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + prompt_adapter_config: PromptAdapterConfig, + prompt_adapter_manager_cls: Type[ + PromptAdapterModelManager] = PromptAdapterModelManager, + **kwargs) -> PromptAdapterModelManager: + """Create a PromptAdapterModel for a given model.""" + prompt_adapter_manager = prompt_adapter_manager_cls( + model=model, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + prompt_adapter_config=prompt_adapter_config, + **kwargs) + return prompt_adapter_manager diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/request.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/request.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb8e61d786a0f81bf617f33a3a06957cdab02ac --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/request.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +import msgspec + +from vllm.adapter_commons.request import AdapterRequest + + +class PromptAdapterRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + frozen=True): # type: ignore[call-arg] + """ + Request for a Prompt adapter. + """ + __metaclass__ = AdapterRequest + + prompt_adapter_name: str + prompt_adapter_id: int + prompt_adapter_local_path: str + prompt_adapter_num_virtual_tokens: int + + def __hash__(self): + return super().__hash__() + + @property + def adapter_id(self): + return self.prompt_adapter_id + + @property + def name(self): + return self.prompt_adapter_name + + @property + def local_path(self): + return self.prompt_adapter_local_path diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/utils.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dd179ab938f8346f2d48881532fa4e7d5b59798a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/utils.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 + +# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 + +import os +from typing import Optional + +import torch +from huggingface_hub import file_exists, hf_hub_download +from huggingface_hub.utils import EntryNotFoundError +from safetensors.torch import load_file as safe_load_file + +from vllm.platforms import current_platform + +WEIGHTS_NAME = "adapter_model.bin" +SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" + + +# Get current device name based on available devices +def infer_device() -> str: + if current_platform.is_cuda_alike(): + return "cuda" + return "cpu" + + +def load_peft_weights(model_id: str, + device: Optional[str] = None, + **hf_hub_download_kwargs) -> dict: + r""" + A helper method to load the PEFT weights from the HuggingFace Hub or locally + + Args: + model_id (`str`): + The local path to the adapter weights or the name of the adapter to + load from the HuggingFace Hub. 
+ device (`str`): + The device to load the weights onto. + hf_hub_download_kwargs (`dict`): + Additional arguments to pass to the `hf_hub_download` method when + loading from the HuggingFace Hub. + """ + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if + hf_hub_download_kwargs.get("subfolder") is not None else model_id) + + if device is None: + device = infer_device() + + if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): + filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) + use_safetensors = True + elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): + filename = os.path.join(path, WEIGHTS_NAME) + use_safetensors = False + else: + token = hf_hub_download_kwargs.get("token") + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token") + + hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], + SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder") is not None + else SAFETENSORS_WEIGHTS_NAME) + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=hub_filename, + revision=hf_hub_download_kwargs.get("revision"), + repo_type=hf_hub_download_kwargs.get("repo_type"), + token=token, + ) + use_safetensors = has_remote_safetensors_file + + if has_remote_safetensors_file: + # Priority 1: load safetensors weights + filename = hf_hub_download( + model_id, + SAFETENSORS_WEIGHTS_NAME, + **hf_hub_download_kwargs, + ) + else: + try: + filename = hf_hub_download(model_id, WEIGHTS_NAME, + **hf_hub_download_kwargs) + except EntryNotFoundError: + raise ValueError( # noqa: B904 + f"Can't find weights for {model_id} in {model_id} or \ + in the Hugging Face Hub. " + f"Please check that the file {WEIGHTS_NAME} or \ + {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.") + + if use_safetensors: + adapters_weights = safe_load_file(filename, device=device) + else: + adapters_weights = torch.load(filename, + map_location=torch.device(device), + weights_only=True) + + return adapters_weights diff --git a/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/worker_manager.py b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/worker_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..28dcc16871120e03646df779e7bddd2b824d64ff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/prompt_adapter/worker_manager.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 + +import logging +from typing import Any, Optional, Set, Type + +import torch + +from vllm.adapter_commons.utils import (add_adapter_worker, + apply_adapters_worker, + list_adapters_worker, + set_active_adapters_worker) +from vllm.adapter_commons.worker_manager import AbstractWorkerManager +from vllm.config import PromptAdapterConfig +from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager, + PromptAdapterModel, + PromptAdapterModelManager, + create_prompt_adapter_manager) +from vllm.prompt_adapter.request import PromptAdapterRequest + +logger = logging.getLogger(__name__) + + +class WorkerPromptAdapterManager(AbstractWorkerManager): + """WorkerPromptAdapterManager that manages + prompt_adapter models on the worker side. 
+ + Every request, the requested prompt_adapters will be + loaded (unless they are already loaded), + and every other prompt_adapter will be unloaded.""" + + _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager + + def __init__( + self, + max_num_seqs: int, + max_num_batched_tokens: int, + device: torch.device, + prompt_adapter_config: PromptAdapterConfig, + prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel + ): + self._adapter_manager: PromptAdapterModelManager + self.max_num_seqs = max_num_seqs + self.max_num_batched_tokens = max_num_batched_tokens + self._prompt_adapter_model_cls = prompt_adapter_model_cls + self.prompt_adapter_config = prompt_adapter_config + super().__init__(device) + + @property + def is_enabled(self) -> bool: + return True + + def create_prompt_adapter_manager( + self, + model: torch.nn.Module, + ) -> Any: + prompt_adapter_manager = create_prompt_adapter_manager( + model, + max_num_seqs=self.max_num_seqs, + max_num_batched_tokens=self.max_num_batched_tokens, + prompt_adapter_config=self.prompt_adapter_config, + prompt_adapter_manager_cls=self._manager_cls, + ) + self._adapter_manager = prompt_adapter_manager + return prompt_adapter_manager.model + + def _load_adapter( + self, prompt_adapter_request: PromptAdapterRequest + ) -> PromptAdapterModel: + try: + prompt_adapter = ( + self._prompt_adapter_model_cls.from_local_checkpoint( + prompt_adapter_request.prompt_adapter_local_path, + prompt_adapter_id=prompt_adapter_request.prompt_adapter_id, + num_virtual_tokens=prompt_adapter_request. + prompt_adapter_num_virtual_tokens, + config=self.prompt_adapter_config, + device=str(self.device), + )) + except Exception as e: + raise RuntimeError( + f"Loading prompt_adapter " + f"{prompt_adapter_request.prompt_adapter_local_path}" + f" failed") from e + return prompt_adapter + + def add_dummy_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + return True + + def pin_adapter(self, adapter_id: int) -> bool: + return self._adapter_manager.pin_adapter(adapter_id) + + def set_active_adapters(self, requests: Set[Any], + mapping: Optional[Any]) -> None: + set_active_adapters_worker(requests, mapping, self._apply_adapters, + self._adapter_manager.set_adapter_mapping) + + def add_adapter(self, adapter_request: Any) -> bool: + return add_adapter_worker(adapter_request, self.list_adapters, + self._load_adapter, + self._adapter_manager.add_adapter, + self._adapter_manager.activate_adapter) + + def _apply_adapters(self, adapter_requests: Set[Any]) -> None: + apply_adapters_worker(adapter_requests, self.list_adapters, + self._adapter_manager.adapter_slots, + self.remove_adapter, self.add_adapter) + + def remove_adapter(self, adapter_id: int) -> bool: + return self._adapter_manager.remove_adapter(adapter_id) + + def remove_all_adapters(self): + self._adapter_manager.remove_all_adapters() + + def list_adapters(self) -> Set[int]: + return list_adapters_worker(self._adapter_manager.list_adapters) + + +class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager): + """WorkerPromptAdapterManager that manages + prompt_adapter models on the worker side. + + Uses an LRU Cache. 
Every request, the requested + prompt_adapters will be loaded (unless they are already loaded) + and least recently used prompt_adapters will + be unloaded if the cache is above capacity.""" + + _prompt_adapter_manager_cls: Type[ + LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager + + def create_prompt_adapter_manager( + self, + model: torch.nn.Module, + ) -> Any: + prompt_adapter_manager = create_prompt_adapter_manager( + model, + max_num_seqs=self.max_num_seqs, + max_num_batched_tokens=self.max_num_batched_tokens, + prompt_adapter_config=self.prompt_adapter_config, + prompt_adapter_manager_cls=self._prompt_adapter_manager_cls) + self._adapter_manager: LRUCachePromptAdapterModelManager = ( + prompt_adapter_manager) + return prompt_adapter_manager.model + + def _apply_adapters( + self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None: + prompt_adapters_map = { + prompt_adapter_request.prompt_adapter_id: prompt_adapter_request + for prompt_adapter_request in prompt_adapter_requests + if prompt_adapter_request + } + if len(prompt_adapters_map + ) > self._adapter_manager.prompt_adapter_slots: + raise RuntimeError( + f"Number of requested prompt_adapters " + f"({len(prompt_adapters_map)}) is greater " + "than the number of GPU prompt_adapter slots " + f"({self._adapter_manager.prompt_adapter_slots}).") + for prompt_adapter in prompt_adapters_map.values(): + self.add_adapter(prompt_adapter) + + def add_adapter(self, + prompt_adapter_request: PromptAdapterRequest) -> bool: + if prompt_adapter_request.prompt_adapter_id not in self.list_adapters( + ): + # Remove before we load the new prompt_adapter to save memory + if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: + self._adapter_manager.remove_oldest_adapter() + prompt_adapter = self._load_adapter(prompt_adapter_request) + loaded = self._adapter_manager.add_adapter(prompt_adapter) + else: + # If the prompt_adapter is already loaded, just touch it to + # update its position in the caches + loaded = self._adapter_manager.get_adapter( + prompt_adapter_request.prompt_adapter_id) is not None + self._adapter_manager.activate_adapter( + prompt_adapter_request.prompt_adapter_id) + return loaded diff --git a/.venv/lib/python3.11/site-packages/vllm/py.typed b/.venv/lib/python3.11/site-packages/vllm/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..33b3ad73cac6fcd1624ea060d44334b79da3ccd0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/py.typed @@ -0,0 +1,2 @@ +# Marker file for PEP 561. +# The vllm package uses inline types. 
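For orientation, a minimal usage sketch of the request type defined in prompt_adapter/request.py above; the adapter name, id, local path and virtual-token count are placeholder values, and the snippet assumes the vllm package added in this diff is importable.

from vllm.prompt_adapter.request import PromptAdapterRequest

# Placeholder values, for illustration only.
request = PromptAdapterRequest(
    prompt_adapter_name="example-adapter",
    prompt_adapter_id=1,
    prompt_adapter_local_path="/tmp/example-adapter",
    prompt_adapter_num_virtual_tokens=8,
)
assert request.adapter_id == 1
assert request.name == "example-adapter"
assert request.local_path == "/tmp/example-adapter"

Because the struct is frozen and defines __hash__, requests can be collected into sets, which matches how the worker managers above receive them (Set[PromptAdapterRequest]).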
diff --git a/.venv/lib/python3.11/site-packages/vllm/sampling_params.py b/.venv/lib/python3.11/site-packages/vllm/sampling_params.py new file mode 100644 index 0000000000000000000000000000000000000000..97f9e2129573147efc4ed4b7e00a316774e994fe --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/sampling_params.py @@ -0,0 +1,506 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Sampling parameters for text generation.""" +import copy +from dataclasses import dataclass +from enum import Enum, IntEnum +from functools import cached_property +from typing import Any, Dict, List, Optional, Set, Union + +import msgspec +from pydantic import BaseModel +from typing_extensions import Annotated + +from vllm.logger import init_logger +from vllm.logits_process import LogitsProcessor + +logger = init_logger(__name__) + +_SAMPLING_EPS = 1e-5 +_MAX_TEMP = 1e-2 + + +class SamplingType(IntEnum): + GREEDY = 0 + RANDOM = 1 + RANDOM_SEED = 2 + + +# maybe make msgspec? +@dataclass +class GuidedDecodingParams: + """One of these fields will be used to build a logit processor.""" + json: Optional[Union[str, Dict]] = None + regex: Optional[str] = None + choice: Optional[List[str]] = None + grammar: Optional[str] = None + json_object: Optional[bool] = None + """These are other options that can be set""" + backend: Optional[str] = None + whitespace_pattern: Optional[str] = None + + @staticmethod + def from_optional( + json: Optional[Union[Dict, BaseModel, str]] = None, + regex: Optional[str] = None, + choice: Optional[List[str]] = None, + grammar: Optional[str] = None, + json_object: Optional[bool] = None, + backend: Optional[str] = None, + whitespace_pattern: Optional[str] = None, + ) -> Optional["GuidedDecodingParams"]: + if all(arg is None + for arg in (json, regex, choice, grammar, json_object)): + return None + # Extract json schemas from pydantic models + if isinstance(json, (BaseModel, type(BaseModel))): + json = json.model_json_schema() + return GuidedDecodingParams( + json=json, + regex=regex, + choice=choice, + grammar=grammar, + json_object=json_object, + backend=backend, + whitespace_pattern=whitespace_pattern, + ) + + def __post_init__(self): + """Validate that some fields are mutually exclusive.""" + guide_count = sum([ + self.json is not None, self.regex is not None, self.choice + is not None, self.grammar is not None, self.json_object is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding but multiple are " + f"specified: {self.__dict__}") + + +class RequestOutputKind(Enum): + # Return entire output so far in every RequestOutput + CUMULATIVE = 0 + # Return only deltas in each RequestOutput + DELTA = 1 + # Do not return intermediate RequestOuputs + FINAL_ONLY = 2 + + +class SamplingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] + """Sampling parameters for text generation. + + Overall, we follow the sampling parameters from the OpenAI text completion + API (https://platform.openai.com/docs/api-reference/completions/create). + In addition, we support beam search, which is not supported by OpenAI. + + Args: + n: Number of output sequences to return for the given prompt. + best_of: Number of output sequences that are generated from the prompt. + From these `best_of` sequences, the top `n` sequences are returned. + `best_of` must be greater than or equal to `n`. By default, + `best_of` is set to `n`. 
+ presence_penalty: Float that penalizes new tokens based on whether they + appear in the generated text so far. Values > 0 encourage the model + to use new tokens, while values < 0 encourage the model to repeat + tokens. + frequency_penalty: Float that penalizes new tokens based on their + frequency in the generated text so far. Values > 0 encourage the + model to use new tokens, while values < 0 encourage the model to + repeat tokens. + repetition_penalty: Float that penalizes new tokens based on whether + they appear in the prompt and the generated text so far. Values > 1 + encourage the model to use new tokens, while values < 1 encourage + the model to repeat tokens. + temperature: Float that controls the randomness of the sampling. Lower + values make the model more deterministic, while higher values make + the model more random. Zero means greedy sampling. + top_p: Float that controls the cumulative probability of the top tokens + to consider. Must be in (0, 1]. Set to 1 to consider all tokens. + top_k: Integer that controls the number of top tokens to consider. Set + to -1 to consider all tokens. + min_p: Float that represents the minimum probability for a token to be + considered, relative to the probability of the most likely token. + Must be in [0, 1]. Set to 0 to disable this. + seed: Random seed to use for the generation. + stop: List of strings that stop the generation when they are generated. + The returned output will not contain the stop strings. + stop_token_ids: List of tokens that stop the generation when they are + generated. The returned output will contain the stop tokens unless + the stop tokens are special tokens. + bad_words: List of words that are not allowed to be generated. + More precisely, only the last token of a corresponding + token sequence is not allowed when the next generated token + can complete the sequence. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. + ignore_eos: Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated. + max_tokens: Maximum number of tokens to generate per output sequence. + min_tokens: Minimum number of tokens to generate per output sequence + before EOS or stop_token_ids can be generated + logprobs: Number of log probabilities to return per output token. + When set to None, no probability is returned. If set to a non-None + value, the result includes the log probabilities of the specified + number of most likely tokens, as well as the chosen tokens. + Note that the implementation follows the OpenAI API: The API will + always return the log probability of the sampled token, so there + may be up to `logprobs+1` elements in the response. + prompt_logprobs: Number of log probabilities to return per prompt token. + detokenize: Whether to detokenize the output. Defaults to True. + skip_special_tokens: Whether to skip special tokens in the output. + spaces_between_special_tokens: Whether to add spaces between special + tokens in the output. Defaults to True. + logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. + truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + guided_decoding: If provided, the engine will construct a guided + decoding logits processor from these parameters. Defaults to None. 
+ logit_bias: If provided, the engine will construct a logits processor + that applies these logit biases. Defaults to None. + allowed_token_ids: If provided, the engine will construct a logits + processor which only retains scores for the given token ids. + Defaults to None. + """ + + n: int = 1 + best_of: Optional[int] = None + _real_n: Optional[int] = None + presence_penalty: float = 0.0 + frequency_penalty: float = 0.0 + repetition_penalty: float = 1.0 + temperature: float = 1.0 + top_p: float = 1.0 + top_k: int = -1 + min_p: float = 0.0 + seed: Optional[int] = None + stop: Optional[Union[str, List[str]]] = None + stop_token_ids: Optional[List[int]] = None + bad_words: Optional[List[str]] = None + ignore_eos: bool = False + max_tokens: Optional[int] = 16 + min_tokens: int = 0 + logprobs: Optional[int] = None + prompt_logprobs: Optional[int] = None + # NOTE: This parameter is only exposed at the engine level for now. + # It is not exposed in the OpenAI API server, as the OpenAI API does + # not support returning only a list of token IDs. + detokenize: bool = True + skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True + # Optional[List[LogitsProcessor]] type. We use Any here because + # Optional[List[LogitsProcessor]] type is not supported by msgspec. + logits_processors: Optional[Any] = None + include_stop_str_in_output: bool = False + truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE + + # The below fields are not supposed to be used as an input. + # They are set in post_init. + output_text_buffer_length: int = 0 + _all_stop_token_ids: Set[int] = msgspec.field(default_factory=set) + + # Fields used to construct logits processors + guided_decoding: Optional[GuidedDecodingParams] = None + logit_bias: Optional[Dict[int, float]] = None + allowed_token_ids: Optional[List[int]] = None + + @staticmethod + def from_optional( + n: Optional[int] = 1, + best_of: Optional[int] = None, + presence_penalty: Optional[float] = 0.0, + frequency_penalty: Optional[float] = 0.0, + repetition_penalty: Optional[float] = 1.0, + temperature: Optional[float] = 1.0, + top_p: Optional[float] = 1.0, + top_k: int = -1, + min_p: float = 0.0, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, + bad_words: Optional[List[str]] = None, + include_stop_str_in_output: bool = False, + ignore_eos: bool = False, + max_tokens: Optional[int] = 16, + min_tokens: int = 0, + logprobs: Optional[int] = None, + prompt_logprobs: Optional[int] = None, + detokenize: bool = True, + skip_special_tokens: bool = True, + spaces_between_special_tokens: bool = True, + logits_processors: Optional[List[LogitsProcessor]] = None, + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta(ge=1)]] = None, + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, + guided_decoding: Optional[GuidedDecodingParams] = None, + logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]] = None, + allowed_token_ids: Optional[List[int]] = None, + ) -> "SamplingParams": + if logit_bias is not None: + logit_bias = { + int(token): bias + for token, bias in logit_bias.items() + } + + return SamplingParams( + n=1 if n is None else n, + best_of=best_of, + presence_penalty=0.0 + if presence_penalty is None else presence_penalty, + frequency_penalty=0.0 + if frequency_penalty is None else frequency_penalty, + repetition_penalty=1.0 + if repetition_penalty is None 
else repetition_penalty, + temperature=1.0 if temperature is None else temperature, + top_p=1.0 if top_p is None else top_p, + top_k=top_k, + min_p=min_p, + seed=seed, + stop=stop, + stop_token_ids=stop_token_ids, + bad_words=bad_words, + include_stop_str_in_output=include_stop_str_in_output, + ignore_eos=ignore_eos, + max_tokens=max_tokens, + min_tokens=min_tokens, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + detokenize=detokenize, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + logits_processors=logits_processors, + truncate_prompt_tokens=truncate_prompt_tokens, + output_kind=output_kind, + guided_decoding=guided_decoding, + logit_bias=logit_bias, + allowed_token_ids=allowed_token_ids, + ) + + def __post_init__(self) -> None: + # how we deal with `best_of``: + # if `best_of`` is not set, we default to `n`; + # if `best_of`` is set, we set `n`` to `best_of`, + # and set `_real_n`` to the original `n`. + # when we return the result, we will check + # if we need to return `n` or `_real_n` results + if self.best_of: + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + if not self._real_n: + self._real_n = self.n + self.n = self.best_of + + if 0 < self.temperature < _MAX_TEMP: + logger.warning( + "temperature %s is less than %s, which may cause numerical " + "errors nan or inf in tensors. We have maxed it out to %s.", + self.temperature, _MAX_TEMP, _MAX_TEMP) + self.temperature = max(self.temperature, _MAX_TEMP) + + if self.seed == -1: + self.seed = None + else: + self.seed = self.seed + + if self.stop is None: + self.stop = [] + elif isinstance(self.stop, str): + self.stop = [self.stop] + else: + self.stop = list(self.stop) + + if self.stop_token_ids is None: + self.stop_token_ids = [] + else: + self.stop_token_ids = list(self.stop_token_ids) + + if self.bad_words is None: + self.bad_words = [] + else: + self.bad_words = list(self.bad_words) + + self.logprobs = 1 if self.logprobs is True else self.logprobs + self.prompt_logprobs = (1 if self.prompt_logprobs is True else + self.prompt_logprobs) + + # Number of characters to hold back for stop string evaluation + # until sequence is finished. + if self.stop and not self.include_stop_str_in_output: + self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 + + self._verify_args() + + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. 
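+            # Resetting top_p/top_k/min_p below makes the greedy path
+            # independent of any filtering values the caller may have passed
+            # alongside a (near-)zero temperature.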
+ self.top_p = 1.0 + self.top_k = -1 + self.min_p = 0.0 + self._verify_greedy_sampling() + # eos_token_id is added to this by the engine + self._all_stop_token_ids = set(self.stop_token_ids) + + def _verify_args(self) -> None: + if not isinstance(self.n, int): + raise ValueError(f"n must be an int, but is of " + f"type {type(self.n)}") + if self.n < 1: + raise ValueError(f"n must be at least 1, got {self.n}.") + if not -2.0 <= self.presence_penalty <= 2.0: + raise ValueError("presence_penalty must be in [-2, 2], got " + f"{self.presence_penalty}.") + if not -2.0 <= self.frequency_penalty <= 2.0: + raise ValueError("frequency_penalty must be in [-2, 2], got " + f"{self.frequency_penalty}.") + if not 0.0 < self.repetition_penalty <= 2.0: + raise ValueError("repetition_penalty must be in (0, 2], got " + f"{self.repetition_penalty}.") + if self.temperature < 0.0: + raise ValueError( + f"temperature must be non-negative, got {self.temperature}.") + if not 0.0 < self.top_p <= 1.0: + raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") + if self.top_k < -1 or self.top_k == 0: + raise ValueError(f"top_k must be -1 (disable), or at least 1, " + f"got {self.top_k}.") + if not isinstance(self.top_k, int): + raise TypeError( + f"top_k must be an integer, got {type(self.top_k).__name__}") + if not 0.0 <= self.min_p <= 1.0: + raise ValueError("min_p must be in [0, 1], got " + f"{self.min_p}.") + if self.max_tokens is not None and self.max_tokens < 1: + raise ValueError( + f"max_tokens must be at least 1, got {self.max_tokens}.") + if self.min_tokens < 0: + raise ValueError(f"min_tokens must be greater than or equal to 0, " + f"got {self.min_tokens}.") + if self.max_tokens is not None and self.min_tokens > self.max_tokens: + raise ValueError( + f"min_tokens must be less than or equal to " + f"max_tokens={self.max_tokens}, got {self.min_tokens}.") + if self.logprobs is not None and self.logprobs < 0: + raise ValueError( + f"logprobs must be non-negative, got {self.logprobs}.") + if self.prompt_logprobs is not None and self.prompt_logprobs < 0: + raise ValueError(f"prompt_logprobs must be non-negative, got " + f"{self.prompt_logprobs}.") + if (self.truncate_prompt_tokens is not None + and self.truncate_prompt_tokens < 1): + raise ValueError(f"truncate_prompt_tokens must be >= 1, " + f"got {self.truncate_prompt_tokens}") + assert isinstance(self.stop, list) + if any(not stop_str for stop_str in self.stop): + raise ValueError("stop cannot contain an empty string.") + if self.stop and not self.detokenize: + raise ValueError( + "stop strings are only supported when detokenize is True. " + "Set detokenize=True to use stop.") + if self.best_of != self._real_n and self.output_kind == ( + RequestOutputKind.DELTA): + raise ValueError("best_of must equal n to use output_kind=DELTA") + + def _verify_greedy_sampling(self) -> None: + if self.n > 1: + raise ValueError("n must be 1 when using greedy sampling, " + f"got {self.n}.") + + def update_from_generation_config( + self, + generation_config: Dict[str, Any], + model_eos_token_id: Optional[int] = None) -> None: + """Update if there are non-default values from generation_config""" + + if model_eos_token_id is not None: + # Add the eos token id into the sampling_params to support + # min_tokens processing. 
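+            # _all_stop_token_ids only feeds the min_tokens masking; stopping
+            # on the EOS token itself is handled separately by the engine.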
+ self._all_stop_token_ids.add(model_eos_token_id) + + # Update eos_token_id for generation + if (eos_ids := generation_config.get("eos_token_id")) is not None: + # it can be either int or list of int + eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) + if model_eos_token_id is not None: + # We don't need to include the primary eos_token_id in + # stop_token_ids since it's handled separately for stopping + # purposes. + eos_ids.discard(model_eos_token_id) + if eos_ids: + self._all_stop_token_ids.update(eos_ids) + if not self.ignore_eos: + eos_ids.update(self.stop_token_ids) + self.stop_token_ids = list(eos_ids) + + @cached_property + def sampling_type(self) -> SamplingType: + if self.temperature < _SAMPLING_EPS: + return SamplingType.GREEDY + if self.seed is not None: + return SamplingType.RANDOM_SEED + return SamplingType.RANDOM + + @property + def all_stop_token_ids(self) -> Set[int]: + return self._all_stop_token_ids + + def clone(self) -> "SamplingParams": + """Deep copy, but maybe not the LogitsProcessor objects. + + LogitsProcessor objects may contain an arbitrary, nontrivial amount of + data that is expensive to copy. However, if not copied, the processor + needs to support parallel decoding for multiple sequences + See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp.clone() if hasattr(lp, 'clone') else lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + + def __repr__(self) -> str: + return ( + f"SamplingParams(n={self.n}, " + f"presence_penalty={self.presence_penalty}, " + f"frequency_penalty={self.frequency_penalty}, " + f"repetition_penalty={self.repetition_penalty}, " + f"temperature={self.temperature}, " + f"top_p={self.top_p}, " + f"top_k={self.top_k}, " + f"min_p={self.min_p}, " + f"seed={self.seed}, " + f"stop={self.stop}, " + f"stop_token_ids={self.stop_token_ids}, " + f"bad_words={self.bad_words}, " + f"include_stop_str_in_output={self.include_stop_str_in_output}, " + f"ignore_eos={self.ignore_eos}, " + f"max_tokens={self.max_tokens}, " + f"min_tokens={self.min_tokens}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"skip_special_tokens={self.skip_special_tokens}, " + "spaces_between_special_tokens=" + f"{self.spaces_between_special_tokens}, " + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"guided_decoding={self.guided_decoding})") + + +class BeamSearchParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. 
+ dict=True): # type: ignore[call-arg] + """Beam search parameters for text generation.""" + beam_width: int + max_tokens: int + ignore_eos: bool = False + temperature: float = 0.0 + length_penalty: float = 1.0 + include_stop_str_in_output: bool = False diff --git a/.venv/lib/python3.11/site-packages/vllm/scalar_type.py b/.venv/lib/python3.11/site-packages/vllm/scalar_type.py new file mode 100644 index 0000000000000000000000000000000000000000..9f6e85920ac7bcccca9221e508a71af8d56fbde2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/scalar_type.py @@ -0,0 +1,332 @@ +# SPDX-License-Identifier: Apache-2.0 + +import functools +import struct +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + + +# Mirrors enum in `core/scalar_type.hpp` +class NanRepr(Enum): + NONE = 0 # nans are not supported + IEEE_754 = 1 # nans are: Exp all 1s, mantissa not all 0s + EXTD_RANGE_MAX_MIN = 2 # nans are: Exp all 1s, mantissa all 1s + + +# This ScalarType class is a parallel implementation of the C++ ScalarType +# class found in csrc/core/scalar_type.hpp. These two classes should be kept +# in sync until the inductor fully supports custom C++ classes. +@dataclass(frozen=True) +class ScalarType: + """ + ScalarType can represent a wide range of floating point and integer + types, in particular it can be used to represent sub-byte data types + (something that torch.dtype currently does not support). It is also + capable of representing types with a bias, i.e.: + `stored_value = value + bias`, + this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias + of 8). The implementation for this class can be found in + csrc/core/scalar_type.hpp, these type signatures should be kept in sync + with that file. + """ + + exponent: int + """ + Number of bits in the exponent if this is a floating point type + (zero if this an integer type) + """ + + mantissa: int + """ + Number of bits in the mantissa if this is a floating point type, + or the number bits representing an integer excluding the sign bit if + this an integer type. + """ + + signed: bool + "If the type is signed (i.e. has a sign bit)" + + bias: int + """ + bias used to encode the values in this scalar type + (value = stored_value - bias, default 0) for example if we store the + type as an unsigned integer with a bias of 128 then the value 0 will be + stored as 128 and -1 will be stored as 127 and 1 will be stored as 129. + """ + + _finite_values_only: bool = False + """ + Private: if infs are supported, used `has_infs()` instead. + """ + + nan_repr: NanRepr = NanRepr.IEEE_754 + """ + How NaNs are represent in this scalar type, returns NanRepr value. 
+ (not applicable for integer types) + """ + + def _floating_point_max_int(self) -> int: + assert ( + self.mantissa <= 52 and self.exponent <= 11 + ), f"Cannot represent max/min as a double for type {self.__str__()}" + + max_mantissa = (1 << self.mantissa) - 1 + if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN: + max_mantissa = max_mantissa - 1 + + max_exponent = (1 << self.exponent) - 2 + if (self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN + or self.nan_repr == NanRepr.NONE): + assert ( + self.exponent < 11 + ), f"Cannot represent max/min as a double for type {self.__str__()}" + max_exponent = max_exponent + 1 + + # adjust the exponent to match that of a double + # for now we assume the exponent bias is the standard 2^(e-1) -1, (where + # e is the exponent bits), there is some precedent for non-standard + # biases, example `float8_e4m3b11fnuz` here: + # https://github.com/jax-ml/ml_dtypes but to avoid premature over + # complication we are just assuming the standard exponent bias until + # there is a need to support non-standard biases + exponent_bias = (1 << (self.exponent - 1)) - 1 + exponent_bias_double = (1 << 10) - 1 # double e = 11 + + max_exponent_double = (max_exponent - exponent_bias + + exponent_bias_double) + + # shift the mantissa and exponent into the proper positions for an + # IEEE double and bitwise-or them together. + return (max_mantissa << + (52 - self.mantissa)) | (max_exponent_double << 52) + + def _floating_point_max(self) -> float: + double_raw = self._floating_point_max_int() + return struct.unpack('!d', struct.pack('!Q', double_raw))[0] + + def _raw_max(self) -> Union[int, float]: + if self.is_floating_point(): + return self._floating_point_max() + else: + assert (self.size_bits < 64 or self.size_bits == 64 + and self.is_signed()), "Cannot represent max as an int" + return (1 << self.mantissa) - 1 + + def _raw_min(self) -> Union[int, float]: + if self.is_floating_point(): + assert self.is_signed( + ), "We currently assume all floating point types are signed" + sign_bit_double = 1 << 63 + + max_raw = self._floating_point_max_int() + min_raw = max_raw | sign_bit_double + return struct.unpack('!d', struct.pack('!Q', min_raw))[0] + else: + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" + + if self.is_signed(): + return -(1 << (self.size_bits - 1)) + else: + return 0 + + @functools.cached_property + def id(self) -> int: + """ + Convert the ScalarType to an int which can be passed to pytorch custom + ops. This layout of the int must be kept in sync with the C++ + ScalarType's from_id method. + """ + val = 0 + offset = 0 + + def or_and_advance(member, bit_width): + nonlocal val + nonlocal offset + bit_mask = (1 << bit_width) - 1 + val = val | (int(member) & bit_mask) << offset + offset = offset + bit_width + + or_and_advance(self.exponent, 8) + or_and_advance(self.mantissa, 8) + or_and_advance(self.signed, 1) + or_and_advance(self.bias, 32) + or_and_advance(self._finite_values_only, 1) + or_and_advance(self.nan_repr.value, 8) + + assert offset <= 64, \ + f"ScalarType fields too big {offset} to fit into an int64" + + return val + + @property + def size_bits(self) -> int: + return self.exponent + self.mantissa + int(self.signed) + + def min(self) -> Union[int, float]: + """ + Min representable value for this scalar type. + (accounting for bias if there is one) + """ + return self._raw_min() - self.bias + + def max(self) -> Union[int, float]: + """ + Max representable value for this scalar type. 
+ (accounting for bias if there is one) + """ + return self._raw_max() - self.bias + + def is_signed(self) -> bool: + """ + If the type is signed (i.e. has a sign bit), same as `signed` + added for consistency with: + https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html + """ + return self.signed + + def is_floating_point(self) -> bool: + "If the type is a floating point type" + return self.exponent != 0 + + def is_integer(self) -> bool: + "If the type is an integer type" + return self.exponent == 0 + + def has_bias(self) -> bool: + "If the type has a non-zero bias" + return self.bias != 0 + + def has_infs(self) -> bool: + "If the type is floating point and supports infinity" + return not self._finite_values_only + + def has_nans(self) -> bool: + return self.nan_repr != NanRepr.NONE.value + + def is_ieee_754(self) -> bool: + """ + If the type is a floating point type that follows IEEE 754 + conventions + """ + return self.nan_repr == NanRepr.IEEE_754.value and \ + not self._finite_values_only + + def __str__(self) -> str: + """ + naming generally follows: https://github.com/jax-ml/ml_dtypes + for floating point types (leading f) the scheme is: + `float_em[flags]` + flags: + - no-flags: means it follows IEEE 754 conventions + - f: means finite values only (no infinities) + - n: means nans are supported (non-standard encoding) + for integer types the scheme is: + `[u]int[b]` + - if bias is not present it means its zero + """ + if self.is_floating_point(): + ret = "float" + str(self.size_bits) + "_e" + str( + self.exponent) + "m" + str(self.mantissa) + + if not self.is_ieee_754(): + if self._finite_values_only: + ret = ret + "f" + if self.nan_repr != NanRepr.NONE: + ret = ret + "n" + + return ret + else: + ret = ("int" if self.is_signed() else "uint") + str(self.size_bits) + if self.has_bias(): + ret = ret + "b" + str(self.bias) + return ret + + def __repr__(self) -> str: + return "ScalarType." + self.__str__() + + # __len__ needs to be defined (and has to throw TypeError) for pytorch's + # opcheck to work. + def __len__(self) -> int: + raise TypeError + + # + # Convenience Constructors + # + + @classmethod + def int_(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': + "Create a signed integer scalar type (size_bits includes sign-bit)." + ret = cls(0, size_bits - 1, True, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': + """Create a unsigned integer scalar type.""" + ret = cls(0, size_bits, False, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType': + """ + Create a standard floating point type + (i.e. follows IEEE 754 conventions). + """ + assert (mantissa > 0 and exponent > 0) + ret = cls(exponent, mantissa, True, 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_(cls, exponent: int, mantissa: int, finite_values_only: bool, + nan_repr: NanRepr) -> 'ScalarType': + """ + Create a non-standard floating point type + (i.e. does not follow IEEE 754 conventions). 
+ """ + assert (mantissa > 0 and exponent > 0) + assert (nan_repr != NanRepr.IEEE_754), ( + "use `float_IEEE754` constructor for floating point types that " + "follow IEEE 754 conventions") + ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr) + ret.id # noqa B018: make sure the id is cached + return ret + + +# naming generally follows: https://github.com/jax-ml/ml_dtypes +# for floating point types (leading f) the scheme is: +# `float_em[flags]` +# flags: +# - no-flags: means it follows IEEE 754 conventions +# - f: means finite values only (no infinities) +# - n: means nans are supported (non-standard encoding) +# for integer types the scheme is: +# `[u]int[b]` +# - if bias is not present it means its zero + + +class scalar_types: + int4 = ScalarType.int_(4, None) + uint4 = ScalarType.uint(4, None) + int8 = ScalarType.int_(8, None) + uint8 = ScalarType.uint(8, None) + float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN) + float8_e5m2 = ScalarType.float_IEEE754(5, 2) + float16_e8m7 = ScalarType.float_IEEE754(8, 7) + float16_e5m10 = ScalarType.float_IEEE754(5, 10) + + # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main + float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE) + + # "gptq" types + uint2b2 = ScalarType.uint(2, 2) + uint3b4 = ScalarType.uint(3, 4) + uint4b8 = ScalarType.uint(4, 8) + uint8b128 = ScalarType.uint(8, 128) + + # colloquial names + bfloat16 = float16_e8m7 + float16 = float16_e5m10 diff --git a/.venv/lib/python3.11/site-packages/vllm/scripts.py b/.venv/lib/python3.11/site-packages/vllm/scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..467cab28f02ae48a6b0a48d575a7ead2befdb5b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/scripts.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 + +# The CLI entrypoint to vLLM. +import argparse +import os +import signal +import sys +from typing import List, Optional + +import uvloop +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +import vllm.version +from vllm.engine.arg_utils import EngineArgs +from vllm.entrypoints.openai.api_server import run_server +from vllm.entrypoints.openai.cli_args import (make_arg_parser, + validate_parsed_serve_args) +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +def register_signal_handlers(): + + def signal_handler(sig, frame): + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTSTP, signal_handler) + + +def serve(args: argparse.Namespace) -> None: + # The default value of `--model` + if args.model != EngineArgs.model: + raise ValueError( + "With `vllm serve`, you should provide the model as a " + "positional argument instead of via the `--model` option.") + + # EngineArgs expects the model name to be passed as --model. 
+ args.model = args.model_tag + + uvloop.run(run_server(args)) + + +def interactive_cli(args: argparse.Namespace) -> None: + register_signal_handlers() + + base_url = args.url + api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") + openai_client = OpenAI(api_key=api_key, base_url=base_url) + + if args.model_name: + model_name = args.model_name + else: + available_models = openai_client.models.list() + model_name = available_models.data[0].id + + print(f"Using model: {model_name}") + + if args.command == "complete": + complete(model_name, openai_client) + elif args.command == "chat": + chat(args.system_prompt, model_name, openai_client) + + +def complete(model_name: str, client: OpenAI) -> None: + print("Please enter prompt to complete:") + while True: + input_prompt = input("> ") + + completion = client.completions.create(model=model_name, + prompt=input_prompt) + output = completion.choices[0].text + print(output) + + +def chat(system_prompt: Optional[str], model_name: str, + client: OpenAI) -> None: + conversation: List[ChatCompletionMessageParam] = [] + if system_prompt is not None: + conversation.append({"role": "system", "content": system_prompt}) + + print("Please enter a message for the chat model:") + while True: + input_message = input("> ") + conversation.append({"role": "user", "content": input_message}) + + chat_completion = client.chat.completions.create(model=model_name, + messages=conversation) + + response_message = chat_completion.choices[0].message + output = response_message.content + + conversation.append(response_message) # type: ignore + print(output) + + +def _add_query_options( + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + parser.add_argument( + "--url", + type=str, + default="http://localhost:8000/v1", + help="url of the running OpenAI-Compatible RESTful API server") + parser.add_argument( + "--model-name", + type=str, + default=None, + help=("The model name used in prompt completion, default to " + "the first model in list models API call.")) + parser.add_argument( + "--api-key", + type=str, + default=None, + help=( + "API key for OpenAI services. If provided, this api key " + "will overwrite the api key obtained through environment variables." + )) + return parser + + +def env_setup(): + # The safest multiprocessing method is `spawn`, as the default `fork` method + # is not compatible with some accelerators. The default method will be + # changing in future versions of Python, so we should use it explicitly when + # possible. + # + # We only set it here in the CLI entrypoint, because changing to `spawn` + # could break some existing code using vLLM as a library. `spawn` will cause + # unexpected behavior if the code is not protected by + # `if __name__ == "__main__":`. 
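+    # Only the environment variable is set here; the actual start method is
+    # picked up later by whichever worker launcher reads
+    # VLLM_WORKER_MULTIPROC_METHOD.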
+ # + # References: + # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing + # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors + # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders + if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: + logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def main(): + env_setup() + + parser = FlexibleArgumentParser(description="vLLM CLI") + parser.add_argument('-v', + '--version', + action='version', + version=vllm.version.__version__) + + subparsers = parser.add_subparsers(required=True, dest="subparser") + + serve_parser = subparsers.add_parser( + "serve", + help="Start the vLLM OpenAI Compatible API server", + usage="vllm serve [options]") + serve_parser.add_argument("model_tag", + type=str, + help="The model tag to serve") + serve_parser.add_argument( + "--config", + type=str, + default='', + required=False, + help="Read CLI options from a config file." + "Must be a YAML with the following options:" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" + ) + + serve_parser = make_arg_parser(serve_parser) + serve_parser.set_defaults(dispatch_function=serve) + + complete_parser = subparsers.add_parser( + "complete", + help=("Generate text completions based on the given prompt " + "via the running API server"), + usage="vllm complete [options]") + _add_query_options(complete_parser) + complete_parser.set_defaults(dispatch_function=interactive_cli, + command="complete") + + chat_parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server", + usage="vllm chat [options]") + _add_query_options(chat_parser) + chat_parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("The system prompt to be added to the chat template, " + "used for models that support system prompts.")) + chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat") + + args = parser.parse_args() + if args.subparser == "serve": + validate_parsed_serve_args(args) + + # One of the sub commands should be executed. 
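+    # Each subcommand attaches its handler via
+    # set_defaults(dispatch_function=...); printing help is the defensive
+    # fallback if none was attached.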
+ if hasattr(args, "dispatch_function"): + args.dispatch_function(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/.venv/lib/python3.11/site-packages/vllm/sequence.py b/.venv/lib/python3.11/site-packages/vllm/sequence.py new file mode 100644 index 0000000000000000000000000000000000000000..534b9e60610a274af35a7f3c23d1ca801e5dd647 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/sequence.py @@ -0,0 +1,1480 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Sequence and its related classes.""" +import copy +import enum +from abc import ABC, abstractmethod +from array import array +from collections import defaultdict +from dataclasses import dataclass, field +from functools import reduce +from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional +from typing import Sequence as GenericSequence +from typing import Set, Tuple, Union + +import msgspec +import torch + +from vllm.inputs import SingletonInputs, SingletonInputsAdapter +from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import RequestOutputKind, SamplingParams + +VLLM_TOKEN_ID_ARRAY_TYPE = "l" + +VLLM_INVALID_TOKEN_ID = -1 + + +def array_full(token_id: int, count: int): + """:class:`array` equivalent of :func:`numpy.full`.""" + return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count + + +# We use dataclass for now because it is used for +# openai server output, and msgspec is not serializable. +# TODO(sang): Fix it. +@dataclass +class Logprob: + """Infos for supporting OpenAI compatible logprobs and token ranks. + + Attributes: + logprob: The logprob of chosen token + rank: The vocab rank of chosen token (>=1) + decoded_token: The decoded chosen token index + """ + logprob: float + rank: Optional[int] = None + decoded_token: Optional[str] = None + + +# {token_id -> logprob} per each sequence group. None if the corresponding +# sequence group doesn't require prompt logprob. +PromptLogprobs = List[Optional[Dict[int, Logprob]]] +# {token_id -> logprob} for each sequence group. +SampleLogprobs = List[Dict[int, Logprob]] + + +class SequenceStatus(enum.IntEnum): + """Status of a sequence.""" + WAITING = 0 + RUNNING = 1 + SWAPPED = 2 + # Note: anything after SWAPPED (2) will be considered + # as a finished status. + FINISHED_STOPPED = 3 + FINISHED_LENGTH_CAPPED = 4 + FINISHED_ABORTED = 5 + FINISHED_IGNORED = 6 + + @staticmethod + def is_finished(status: "SequenceStatus") -> bool: + return status > SequenceStatus.SWAPPED + + @staticmethod + def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: + if status == SequenceStatus.FINISHED_STOPPED: + finish_reason = "stop" + elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: + finish_reason = "length" + elif status == SequenceStatus.FINISHED_ABORTED: + finish_reason = "abort" + elif status == SequenceStatus.FINISHED_IGNORED: + # The ignored sequences are the sequences whose prompt lengths + # are longer than the model's length cap. Therefore, the stop + # reason should also be "length" as in OpenAI API. + finish_reason = "length" + else: + finish_reason = None + return finish_reason + + +class SequenceStage(enum.Enum): + PREFILL = enum.auto() + DECODE = enum.auto() + + +@dataclass +class RequestMetrics: + """Metrics associated with a request. + + Attributes: + arrival_time: The time when the request arrived. 
+ first_scheduled_time: The time when the request was first scheduled. + first_token_time: The time when the first token was generated. + time_in_queue: The time the request spent in the queue. + finished_time: The time when the request was finished. + scheduler_time: The time spent in the scheduler when this request was + being considered by the scheduler. + model_forward_time: The time spent in the model forward pass when this + request was in the batch. + model_execute_time: The time spent in the model execute function. This + will include model forward, block/sync across + workers, cpu-gpu sync time and sampling time. + """ + arrival_time: float + last_token_time: float + first_scheduled_time: Optional[float] + first_token_time: Optional[float] + time_in_queue: Optional[float] + finished_time: Optional[float] = None + scheduler_time: Optional[float] = None + model_forward_time: Optional[float] = None + model_execute_time: Optional[float] = None + + +class SequenceDataDelta( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] + """Delta SequenceData to send to workers per step.""" + # A new token to be appended to existing SequenceData. + new_output_token_ids: List[int] + # Overwriting existing `cumulative_logprob` + new_cumulative_logprob: float + # Overwriting existing `num_computed_tokens`. + new_num_computed_tokens: int + # Overwriting existing `stage`. + new_stage: SequenceStage + + +class SequenceData(msgspec.Struct, + omit_defaults=True): # type: ignore[call-arg] + """Data associated with a sequence. + + Args: + prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. + + Attributes: + prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. + cumulative_logprob: The cumulative log probability of the output. + """ + # NOTE: we cannot use Union[List, array] because msgspec cannot support + # union of 2 list types. + _prompt_token_ids: array + _output_token_ids: array = msgspec.field( + default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) + + ### The below fields should not be passed as an argument ### + _cumulative_logprob: float = 0.0 + _prompt_token_ids_tuple: Tuple[int, + ...] = msgspec.field(default_factory=tuple) + # The number of tokens that are computed (that run against the model). + _num_computed_tokens: int = 0 + # The number of tokens with prefix cache hit. + _num_cached_tokens: int = 0 + _stage: SequenceStage = SequenceStage.PREFILL + _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) + + # It is used to get delta input. It is reset when `get_delta_and_reset` + # is called. + _new_appended_tokens: List[int] = msgspec.field(default_factory=list) + + # It is used to compute mrope_position_ids. + _mrope_position_delta: Optional[int] = None + + @staticmethod + def from_prompt_token_counts( + *token_counts: Tuple[int, int]) -> "SequenceData": + """ + Construct a :class:`SequenceData` instance by concatenating + prompt token sequences. + + Each tuple represents one token sequence, expressed in the form + :code:`(token_id, count)`. 
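+        Example (illustrative values): a call like
+        :code:`from_prompt_token_counts((10, 3), (7, 2))` builds a prompt of
+        :code:`[10, 10, 10, 7, 7]`.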
+ """ + if len(token_counts) == 0: + return SequenceData.from_seqs([]) + + prompt_token_ids_arr = reduce( + array.__iadd__, + (array_full(token_id, count) for token_id, count in token_counts), + ) + + return SequenceData(prompt_token_ids_arr) + + @staticmethod + def from_seqs( + prompt_token_ids: GenericSequence[int], + output_token_ids: Optional[GenericSequence[int]] = None, + ) -> "SequenceData": + """ + Construct a :class:`SequenceData` instance from prompt and output + token sequences. + """ + prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, + prompt_token_ids) + + if output_token_ids is None: + return SequenceData(prompt_token_ids_arr) + + output_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, + output_token_ids) + + return SequenceData(prompt_token_ids_arr, + _output_token_ids=output_token_ids_arr) + + def __post_init__(self) -> None: + assert self._prompt_token_ids.typecode == "l" + assert self._output_token_ids.typecode == "l" + self._prompt_token_ids_tuple: Tuple[int, ...] = tuple( + self._prompt_token_ids) + self._update_cached_all_tokens() + + def _update_cached_all_tokens(self): + assert isinstance(self._prompt_token_ids, array) + assert isinstance(self._output_token_ids, array) + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) + + @property + def cumulative_logprob(self) -> float: + return self._cumulative_logprob + + @property + def prompt_token_ids(self) -> Tuple[int, ...]: + return self._prompt_token_ids_tuple + + @prompt_token_ids.setter + def prompt_token_ids(self, new_prompt_token_ids) -> None: + raise NotImplementedError + + @property + def prompt_token_ids_array(self) -> array: + """Return the prompt token ids in array type. + + Note that the array is in "I" type, and it is not compatible + with torch.long (2 bytes vs 4 bytes). So beware of the usage. + """ + return self._prompt_token_ids + + @property + def output_token_ids(self) -> Tuple[int, ...]: + return tuple(self._output_token_ids) + + @output_token_ids.setter + def output_token_ids(self, + new_output_token_ids: GenericSequence[int]) -> None: + self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + new_output_token_ids) + self._update_cached_all_tokens() + + @property + def output_token_ids_array(self) -> array: + """Return the prompt token ids in array type. + + Note that the array is in "I" type, and it is not compatible + with torch.long (2 bytes vs 4 bytes). So beware of the usage. 
+ """ + assert isinstance(self._output_token_ids, array) + return self._output_token_ids + + @property + def mrope_position_delta(self) -> Optional[int]: + return self._mrope_position_delta + + @mrope_position_delta.setter + def mrope_position_delta(self, new_mrope_position_delta): + self._mrope_position_delta = new_mrope_position_delta + + def append_token_id(self, token_id: int, logprob: float) -> None: + self._output_token_ids.append(token_id) + self._new_appended_tokens.append(token_id) + self._cached_all_token_ids.append(token_id) + self._cumulative_logprob += logprob + + def get_len(self) -> int: + return len(self._output_token_ids) + len(self._prompt_token_ids) + + def get_prompt_len(self) -> int: + return len(self._prompt_token_ids) + + def get_output_len(self) -> int: + return len(self._output_token_ids) + + def get_token_ids(self) -> List[int]: + return self._cached_all_token_ids + + def get_prefix_token_ids( + self, num_tokens: int + ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]: + """Get prefix tokens, and make the return value hashable""" + prompt_length = self.get_prompt_len() + if num_tokens > prompt_length: + return (self._prompt_token_ids_tuple, + tuple(self._output_token_ids[:num_tokens - prompt_length])) + else: + return (self._prompt_token_ids_tuple[:num_tokens], None) + + def get_num_computed_tokens(self) -> int: + """Return the number of prefill tokens that are already computed.""" + return self._num_computed_tokens + + def update_num_computed_tokens(self, num_new_computed_tokens: int): + """Update number of tokens computed so far.""" + self._num_computed_tokens += num_new_computed_tokens + assert self._num_computed_tokens <= self.get_len(), ( + self._num_computed_tokens, self.get_len()) + # If all tokens are computed, it means it is in decoding phase. + if self.get_num_uncomputed_tokens() == 0: + self._stage = SequenceStage.DECODE + + def get_num_cached_tokens(self) -> int: + """Return the number of tokens with prefix cache hit.""" + return self._num_cached_tokens + + def update_num_cached_tokens(self, num_cached_tokens: int): + """Update the number of tokens with prefix cache hit.""" + self._num_cached_tokens = num_cached_tokens + + def reset_state_for_recompute(self) -> None: + """Reset the number of computed tokens from this sequence. It is + supposed to be called when a sequence needs to be started from + the beginning again (e.g., sequence is preempted). + """ + self._num_computed_tokens = 0 + self._stage = SequenceStage.PREFILL + self._new_appended_tokens = [] + + def get_num_uncomputed_tokens(self) -> int: + """Return the number of prefill tokens that are not computed.""" + # we use `get_len()` which includes prompt_len + output_len instead + # of prompt_len here. This is because during recompute we need to + # prefill for both prompt and output. + return self.get_len() - self.get_num_computed_tokens() + + def get_last_token_id(self) -> int: + if not self._output_token_ids: + return self._prompt_token_ids[-1] + return self._output_token_ids[-1] + + def get_prompt_token_ids(self) -> Tuple[int, ...]: + return self.prompt_token_ids + + def get_output_token_ids(self) -> Tuple[int, ...]: + return self.output_token_ids + + def get_delta_and_reset(self) -> SequenceDataDelta: + delta = SequenceDataDelta(self._new_appended_tokens, + self._cumulative_logprob, + self.get_num_computed_tokens(), self.stage) + # Reset delta state. 
+ self._new_appended_tokens = [] + return delta + + def apply_delta(self, delta: SequenceDataDelta): + self._num_computed_tokens = delta.new_num_computed_tokens + self._cumulative_logprob = delta.new_cumulative_logprob + self._stage = delta.new_stage + self._output_token_ids.extend(delta.new_output_token_ids) + self._cached_all_token_ids.extend(delta.new_output_token_ids) + + @property + def stage(self) -> SequenceStage: + return self._stage + + def __repr__(self) -> str: + return (f"SequenceData(" + f"prompt_token_ids={self._prompt_token_ids}, " + f"output_token_ids={self.output_token_ids}, " + f"cumulative_logprob={self.cumulative_logprob}, " + f"get_num_computed_tokens={self.get_num_computed_tokens()}") + + +class Sequence: + """Stores the data, status, and block information of a sequence. + + The sequence is constructed from the :data:`DecoderOnlyInputs` + (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the :code:`inputs` constructor argument. + + Args: + seq_id: The ID of the sequence. + inputs: The inputs of the sequence. + block_size: The block size of the sequence. Should be the same as the + block size used by the block manager and cache engine. + eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. + lora_request: LoRA request. + prompt_adapter_request: Prompt Adapter request. + """ + + def __init__( + self, + seq_id: int, + inputs: SingletonInputs, + block_size: int, + eos_token_id: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> None: + self.seq_id = seq_id + self.inputs = SingletonInputsAdapter(inputs) + self.block_size = block_size + self.eos_token_id = eos_token_id + self.lora_request = lora_request + self.prompt_adapter_request = prompt_adapter_request + + self.data = SequenceData.from_seqs(self.prompt_token_ids) + self.output_logprobs: SampleLogprobs = [] + self.output_text = "" + + self.status = SequenceStatus.WAITING + self.stop_reason: Union[int, str, None] = None + + # These are used to keep track of delta outputs + self._last_output_token_ids_offset: int = 0 + self._last_output_text_offset: int = 0 + + # Used for incremental detokenization + self.prefix_offset = 0 + self.read_offset = 0 + # Input + output tokens + self.tokens: Optional[List[str]] = None + + @property + def n_blocks(self) -> int: + return (self.get_len() + self.block_size - 1) // self.block_size + + @property + def prompt(self) -> Optional[str]: + return self.inputs.prompt + + @property + def prompt_token_ids(self) -> List[int]: + return self.inputs.prompt_token_ids + + @property + def prompt_embeds(self) -> Optional[torch.Tensor]: + return self.inputs.prompt_embeds + + @property + def token_type_ids(self) -> List[int]: + return self.inputs.token_type_ids + + @property + def multi_modal_data(self) -> "MultiModalDataDict": + return self.inputs.multi_modal_data + + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.inputs.multi_modal_placeholders + + @property + def mm_processor_kwargs(self) -> Dict[str, Any]: + return self.inputs.mm_processor_kwargs + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + @property + def prompt_adapter_id(self) -> int: + return self.prompt_adapter_request.prompt_adapter_id \ + if self.prompt_adapter_request else 0 + + def get_output_text_to_return(self, buffer_length: int, + delta: bool) -> str: + """If delta 
is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + truncate = buffer_length and not self.is_finished() + if not delta: + return self.output_text[:-buffer_length] if truncate else ( + self.output_text) + length = len(self.output_text) + if truncate: + length -= buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + def get_output_token_ids_to_return( + self, delta: bool) -> Union[GenericSequence[int], int]: + """If delta is True, only new tokens since the last call to + this method are returned""" + if not delta: + return self.get_output_token_ids() + + output_len = self.get_output_len() + + # Get the number of new tokens + num_new_tokens = output_len - self._last_output_token_ids_offset + self._last_output_token_ids_offset = output_len + + # Return new tokens + if num_new_tokens == 1: + # Optimization for single decode token case + # (which is what we have most of the time) + return self.data._cached_all_token_ids[-1] + + if num_new_tokens == 0: + return [] + + return self.data._cached_all_token_ids[-num_new_tokens:] + + def hash_of_block(self, logical_idx: int) -> int: + # TODO This can produce incorrect hash when block size > prompt size + + # Compute the number of tokens in the sequence + # TODO: The current hashing function is O(L^2). We should optimize + # this in the future. + num_tokens = self.num_hashed_tokens_of_block(logical_idx) + hashed_tokens = self.data.get_prefix_token_ids(num_tokens) + return hash((hashed_tokens, self.lora_int_id)) + + def extra_hash(self) -> Optional[int]: + """ + This function computes an extra hash for a sequence, specifically + designed for prefix caching mode. The final sequence hash is determined + by applying token_ids from the sequence's blocks. + """ + if self.prompt_adapter_id == 0 and self.lora_int_id == 0: + return None + + # NOTE: If there are additional factors influencing the block aside from + # token_ids, include them as input parameters to the hash. 
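+        # NOTE: as with hash_of_block() above, folding the adapter identity
+        # into the hash presumably prevents sequences running under different
+        # LoRA or prompt-adapter state from reusing cached blocks whose token
+        # ids happen to match.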
+ return hash((self.prompt_adapter_id, self.lora_int_id)) + + def num_hashed_tokens_of_block(self, logical_idx: int): + return logical_idx * self.block_size + self.block_size + + def reset_state_for_recompute(self): + """Reset the sequence states for recomputation.""" + self.data.reset_state_for_recompute() + + def append_token_id(self, token_id: int, logprobs: Dict[int, + Logprob]) -> None: + assert token_id in logprobs + self.output_logprobs.append(logprobs) + self.data.append_token_id(token_id, logprobs[token_id].logprob) + + def get_len(self) -> int: + return self.data.get_len() + + def get_prompt_len(self) -> int: + return self.data.get_prompt_len() + + def get_output_len(self) -> int: + return self.data.get_output_len() + + def get_token_ids(self) -> List[int]: + return self.data.get_token_ids() + + def get_prompt_token_ids(self) -> Tuple[int, ...]: + return self.data.get_prompt_token_ids() + + def get_last_token_id(self) -> int: + return self.data.get_last_token_id() + + def get_output_token_ids(self) -> Tuple[int, ...]: + return self.data.get_output_token_ids() + + def get_cumulative_logprob(self) -> float: + return self.data.cumulative_logprob + + def is_finished(self) -> bool: + return SequenceStatus.is_finished(self.status) + + def fork(self, new_seq_id: int) -> "Sequence": + new_seq = copy.deepcopy(self) + new_seq.seq_id = new_seq_id + return new_seq + + def get_num_new_tokens(self) -> int: + """Get the number of new tokens to be computed. + + Returns: + The new number of tokens to be computed. I.e., 1 for decode, or + the remaining prompt size for prefill. + """ + if self.data.stage == SequenceStage.DECODE: + return 1 + return self.data.get_num_uncomputed_tokens() + + def get_num_computed_tokens(self) -> int: + return self.data.get_num_computed_tokens() + + def is_prefill(self) -> bool: + return self.data.stage == SequenceStage.PREFILL + + def __repr__(self) -> str: + return (f"Sequence(seq_id={self.seq_id}, " + f"status={self.status.name}, " + f"num_blocks={self.n_blocks}, ") + + +class SequenceGroupState(msgspec.Struct, + omit_defaults=True): # type: ignore[call-arg] + """Mutable state tied to a specific sequence group""" + + # for multi-step decoding + num_steps: int = 1 + current_step: int = 0 + + @property + def remaining_steps(self) -> int: + return self.num_steps - self.current_step + + +class SequenceGroup: + """A group of sequences that are generated from the same prompt. + + Args: + request_id: The ID of the request. + seqs: The list of sequences. + sampling_params: The sampling parameters used to generate the outputs. + arrival_time: The arrival time of the request. + lora_request: LoRA request. + pooling_params: The parameters used to generate the pooler + for a pooling model. + pooled_data: The extracted hidden states from a pooling model. + encoder_seq: Optional, the single encoder sequence. Should be None + unless you are working with an encoder/decoder model. + trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: Prompt Adapter request. + priority: User-defined priority of the request. 
+ """ + + def __init__( + self, + request_id: str, + seqs: List[Sequence], + arrival_time: float, + sampling_params: Optional[SamplingParams] = None, + lora_request: Optional[LoRARequest] = None, + pooling_params: Optional[PoolingParams] = None, + pooled_data: Optional[torch.Tensor] = None, + encoder_seq: Optional[Sequence] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> None: + self.request_id = request_id + self.seqs = seqs + self.first_seq = seqs[0] + self.arrival_time = arrival_time + self.is_single_seq = len(seqs) == 1 + self.seqs_dict = {seq.seq_id: seq for seq in seqs} + + self.sampling_params = sampling_params + self.metrics = RequestMetrics(arrival_time=arrival_time, + last_token_time=arrival_time, + first_scheduled_time=None, + first_token_time=None, + time_in_queue=None) + self.last_token_latency = 0.0 + self.lora_request = lora_request + self.prompt_logprobs: Optional[PromptLogprobs] = None + self.state = SequenceGroupState() + self.pooling_params = pooling_params + self.pooled_data = pooled_data + self.prompt_adapter_request = prompt_adapter_request + self.encoder_seq = encoder_seq + self.trace_headers = trace_headers + self.priority = priority + + self.cached_request_output = None + + @property + def prompt(self) -> Optional[str]: + return self.first_seq.prompt + + @property + def prompt_token_ids(self) -> List[int]: + return self.first_seq.prompt_token_ids + + @property + def encoder_prompt(self) -> Optional[str]: + # There are either 0 or 1 encoder sequences + # If one is present, its prompt is distinct + # from the decoder's. + return (self.encoder_seq.prompt + if self.encoder_seq is not None else None) + + @property + def encoder_prompt_token_ids(self) -> Optional[List[int]]: + # There are either 0 or 1 encoder sequences + # If one is present, its prompt token ids are + # distinct from the decoder's. 
+ return (self.encoder_seq.prompt_token_ids + if self.encoder_seq is not None else None) + + @property + def token_type_ids(self) -> Optional[List[int]]: + return self.first_seq.token_type_ids + + @property + def multi_modal_data(self) -> MultiModalDataDict: + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_data + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_data + return {} + + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_placeholders + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_placeholders + return {} + + @property + def mm_processor_kwargs(self) -> Dict[str, Any]: + if self.first_seq.multi_modal_data: + return self.first_seq.mm_processor_kwargs + elif self.encoder_seq is not None: + return self.encoder_seq.mm_processor_kwargs + return {} + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + @property + def prompt_adapter_id(self) -> int: + return self.prompt_adapter_request.prompt_adapter_id \ + if self.prompt_adapter_request else 0 + + @property + def prompt_adapter_num_virtual_tokens(self) -> int: + return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\ + if self.prompt_adapter_request else 0 + + def init_multi_step(self, num_steps: int) -> None: + self.state.num_steps = num_steps + self.state.current_step = 0 + + def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, + num_scheduler_steps: int, + is_multi_step: bool, + enable_chunking: bool) -> None: + + if not is_multi_step: + self.init_multi_step(num_steps=num_scheduler_steps) + return + + # Multi-Step case + is_prefill = self.is_prefill() + + # The asserts below reflect the expectations of the current system. + if is_prefill and enable_chunking: + assert num_lookahead_slots == num_scheduler_steps + self.init_multi_step(num_steps=num_lookahead_slots) + else: + is_decode: bool = not is_prefill + # If it is a prefill, num_lookahead_slots must be 0 + assert num_lookahead_slots == 0 or is_decode + # If it is a decode, num_lookahead_slots + 1 must match + # the scheduler steps. + assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill + self.init_multi_step(num_steps=num_lookahead_slots + 1) + + def set_last_token_time(self, now: float) -> None: + """Sets the last token time for Request level timings.""" + # If still in prefill phase, assertion fails. + assert not self.is_prefill(), ( + "seq_group.set_last_token_time() should not be called " + "if the seq_group is in prefill phase.") + self.last_token_latency = now - self.metrics.last_token_time + self.metrics.last_token_time = now + + def get_last_token_latency(self) -> float: + """Returns the latency of the last token.""" + assert not self.is_prefill(), ( + "seq_group.get_last_token_latency() should not be called " + "if the seq_group is in prefill phase.") + return self.last_token_latency + + def maybe_set_first_token_time(self, time: float) -> None: + """Sets the first token time for Request level timings.""" + # Note: in a case where a sequence_group is swapped and + # recomputed, the time between iterations is counted + # in TPOT, rather than recalculating TTFT (since from the ) + # POV of the user, there is simply a long generation delay. 
+ if (self.metrics.first_token_time is None + and self.first_seq.get_output_len() == 1): + self.metrics.first_token_time = time + + def maybe_set_first_scheduled_time(self, time: float) -> None: + """Sets the first scheduled time and time in queue for Request + level timings.""" + if self.metrics.first_scheduled_time is None: + self.metrics.first_scheduled_time = time + self.metrics.time_in_queue = time - self.metrics.arrival_time + + def set_finished_time(self, time: Optional[float]) -> None: + """Sets the finished time for Request level timings.""" + self.metrics.finished_time = time + + def get_max_num_running_seqs(self) -> int: + """The maximum number of sequences running in parallel in the remaining + lifetime of the request.""" + if self.is_single_seq: + return 0 if self.first_seq.is_finished() else 1 + return self.num_seqs() - self.num_finished_seqs() + + def get_seqs( + self, + status: Optional[SequenceStatus] = None, + ) -> List[Sequence]: + if status is None: + return self.seqs + + if self.is_single_seq: + return self.seqs if self.first_seq.status == status else [] + + return [seq for seq in self.seqs if seq.status == status] + + def is_encoder_decoder(self) -> bool: + return self.encoder_seq is not None + + def get_encoder_seq(self) -> Optional[Sequence]: + return self.encoder_seq + + def get_finished_seqs(self) -> List[Sequence]: + if self.is_single_seq: + return self.seqs if self.first_seq.is_finished() else [] + + return [seq for seq in self.seqs if seq.is_finished()] + + def update_num_computed_tokens(self, num_new_computed_tokens: int): + """Update number of tokens computed so far.""" + for seq in self.seqs: + if not seq.is_finished(): + seq.data.update_num_computed_tokens(num_new_computed_tokens) + + def get_num_uncomputed_tokens(self) -> int: + num_uncomputed_tokens = 0 + for seq in self.seqs: + if not seq.is_finished(): + num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() + return num_uncomputed_tokens + + def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: + # Optimization. We don't need to call get_seqs if we don't need to + # filter by states. + if status is None: + return len(self.seqs) + + if self.is_single_seq: + return 1 if self.seqs[0].status == status else 0 + + return len(self.get_seqs(status)) + + def num_finished_seqs(self) -> int: + if self.is_single_seq: + return 1 if self.seqs[0].is_finished() else 0 + return len(self.get_finished_seqs()) + + def is_finished(self) -> bool: + if self.is_single_seq: + return self.first_seq.is_finished() + return all(seq.is_finished() for seq in self.seqs) + + def is_prefill(self) -> bool: + return self.first_seq.is_prefill() + + def __repr__(self) -> str: + return (f"SequenceGroup(request_id={self.request_id}, " + f"sampling_params={self.sampling_params}, " + f"num_seqs={len(self.seqs)})") + + +class SequenceGroupMetadataDelta( + msgspec.Struct, + tag=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] + """Delta of SequenceGroupMetadata. + + After sending the first SequenceGroupMetadata, vLLM scheduler + only sends delta to reduce the data payload size. 
+ """ + seq_data_delta: Dict[int, SequenceDataDelta] + request_id: str + block_tables: Dict[int, List[int]] + is_prompt: bool + do_sample: bool = True + token_chunk_size: Optional[int] = None + computed_block_nums: Optional[List[int]] = None + state: Optional[SequenceGroupState] = msgspec.field( + default_factory=lambda: SequenceGroupState()) + + +class SequenceGroupMetadata( + msgspec.Struct, + tag=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] + """Metadata for a sequence group. Used to create `AttentionMetadata`. + + Args: + request_id: The ID of the request. + is_prompt: Whether the request is at prompt stage. + seq_data: The sequence data. (Seq id -> sequence data) + sampling_params: The sampling parameters used to generate the outputs. + block_tables: The block tables. (Seq id -> list of physical block + numbers) + do_sample: True if sampling is required. Sampling is not required when + e.g., prefill is chunked, and the current iteration only computes + query tokens for prefill, we don't need sampling. + token_chunk_size: The number of tokens to be processed (per sequence). + None if chunking is not required. + lora_request: LoRA request. + computed_block_nums: The block numbers that are already computed, + used in prefix caching. + state: Internal state tied to this sequence group. + multi_modal_data: Multi modal data. + mm_processor_kwargs: Multimodal input processor / mapper overrides. + encoder_seq_data: Optional sequence data for encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. + cross_block_table: Optional cross-attention block table associated + with the encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. + prompt_adapter_request: Prompt Adapter request. + """ + + request_id: str + is_prompt: bool + seq_data: Dict[int, SequenceData] + sampling_params: Optional[SamplingParams] + block_tables: Dict[int, List[int]] + do_sample: bool = True + pooling_params: Optional[PoolingParams] = None + lora_request: Optional[LoRARequest] = None + computed_block_nums: Optional[List[int]] = None + state: Optional[SequenceGroupState] = msgspec.field( + default_factory=lambda: SequenceGroupState()) + # "MultiModalDataDict" types. We have to use Any due to msgspec + # doesn't allow to have union of 2 different dicts. + token_type_ids: Optional[List[int]] = None + multi_modal_data: Optional[Any] = None + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None + mm_processor_kwargs: Optional[Dict[str, Any]] = None + encoder_seq_data: Optional[SequenceData] = None + cross_block_table: Optional[List[int]] = None + prompt_adapter_request: Optional[PromptAdapterRequest] = None + token_chunk_size: Optional[int] = None + + ### Stateful fields that are lazily defined. ### + # The number of speculative tokens adopted in this request. + # None means specuative decoding is not used. + # Zero means speculative decoding is disabled for some reasons. + # TODO: We should maintain this states out of the sequence group. 
+ num_speculative_tokens: Optional[int] = None + + def __post_init__(self): + if self.seq_data is not None and self.token_chunk_size is None: + if self.is_prompt: + self.token_chunk_size = next(iter( + self.seq_data.values())).get_len() + else: + self.token_chunk_size = 1 + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + @property + def prompt_adapter_id(self) -> int: + return self.prompt_adapter_request.prompt_adapter_id \ + if self.prompt_adapter_request else 0 + + @property + def prompt_adapter_num_virtual_tokens(self) -> int: + return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ + if self.prompt_adapter_request else 0 + + # Multi-Step Chunked-Prefill property + @property + def is_single_step_prompt(self) -> bool: + # do_sample is true, only when the token_chunk_size matches the + # num_uncomputed_tokens of the sequence. This indicates that + # the prompt will finish processing in a single `execute_model` + # step. + return self.is_prompt and self.do_sample + + def get_first_seq_id(self) -> int: + # This is an efficient way of fetching the seq_id when + # we know this SequenceGroup has only one sequence. + return next(iter(self.seq_data)) + + def apply_delta(self, + sequence_group_metadata_delta: SequenceGroupMetadataDelta): + for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): + self.seq_data[id].apply_delta(delta) + assert self.request_id == sequence_group_metadata_delta.request_id + self.block_tables = sequence_group_metadata_delta.block_tables + self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size + self.do_sample = sequence_group_metadata_delta.do_sample + self.is_prompt = sequence_group_metadata_delta.is_prompt + + def finish_step(self) -> None: + assert self.state is not None + assert self.state.current_step < self.state.num_steps, \ + f"current step {self.state.current_step}, num_steps {self.state.num_steps}" # noqa + self.state.current_step += 1 + + +class SequenceOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] + """The model output associated with a sequence. + + Args: + parent_seq_id: The ID of the parent sequence (for forking in beam + search). + output_token: The output token ID. + logprobs: The logprobs of the output token. + (Token id -> logP(x_i+1 | x_0, ..., x_i)) + """ + parent_seq_id: int + output_token: int + logprobs: Dict[int, Logprob] + + def __repr__(self) -> str: + return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " + f"output_token={self.output_token}, " + f"logprobs={self.logprobs})") + + def __eq__(self, other: object) -> bool: + if not isinstance(other, SequenceOutput): + raise NotImplementedError() + equal = (self.parent_seq_id == other.parent_seq_id + and self.output_token == other.output_token) + log_probs_equal = other.logprobs == self.logprobs + return equal and log_probs_equal + + +class SequenceGroupOutput(ABC): + """The base class for model outputs associated with a sequence group.""" + + @abstractmethod + def __repr__(self) -> str: + pass + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass + + +class CompletionSequenceGroupOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] + """The model output associated with a completion sequence group.""" + __metaclass__ = SequenceGroupOutput + samples: List[SequenceOutput] + # Prompt logprob for each prompt query token. 
+ prompt_logprobs: Optional[PromptLogprobs] + + def __repr__(self) -> str: + return (f"CompletionSequenceGroupOutput(samples={self.samples}, " + f"prompt_logprobs={self.prompt_logprobs})") + + def __eq__(self, other: object) -> bool: + if not isinstance(other, CompletionSequenceGroupOutput): + raise NotImplementedError() + return (self.samples == other.samples + and self.prompt_logprobs == other.prompt_logprobs) + + +class PoolingSequenceGroupOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] +): + """The model output associated with a pooling sequence group.""" + __metaclass__ = SequenceGroupOutput + # Annotated as Any to be compatible with msgspec + # The actual type is in SequenceGroup.pooled_data + data: Any + + def __repr__(self) -> str: + return f"PoolingSequenceGroupOutput(data={self.data}" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, PoolingSequenceGroupOutput): + raise NotImplementedError() + return self.data == other.data + + +# cannot use msgspec.Struct here because Dynamo does not support it +@dataclass +class IntermediateTensors: + """For all pipeline stages except the last, we need to return the hidden + states and residuals to be sent to the next stage. This data structure + contains the hidden states and residuals for a request. + """ + + tensors: Dict[str, torch.Tensor] + + def __init__(self, tensors): + # manually define this function, so that + # Dynamo knows `IntermediateTensors()` comes from this file. + # Otherwise, dataclass will generate this function by evaluating + # a string, and we will lose the information about the source file. + self.tensors = tensors + + def __getitem__(self, key: Union[str, slice]): + if isinstance(key, str): + return self.tensors[key] + elif isinstance(key, slice): + return self.__class__({k: v[key] for k, v in self.tensors.items()}) + + def __setitem__(self, key: str, value: torch.Tensor): + self.tensors[key] = value + + def __len__(self): + return len(self.tensors) + + def __eq__(self, other: object): + return isinstance(other, self.__class__) and self + + def __repr__(self) -> str: + return f"IntermediateTensors(tensors={self.tensors})" + + +class PoolerOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] + """The output from a pooling operation in the pooling model.""" + outputs: List[PoolingSequenceGroupOutput] + + def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: + return self.outputs[idx] + + def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] + + +def get_all_seq_ids_and_request_ids( + seq_group_metadata_list: List[SequenceGroupMetadata] +) -> Tuple[List[int], Dict[str, Set[int]]]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. 
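+    Also returns a mapping from each request id to the set of sequence ids
+    that belong to it.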
+ """ + seq_ids: List[int] = [] + request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set) + for sg in seq_group_metadata_list: + for seq_id in sg.seq_data: + seq_ids.append(seq_id) + request_id_seq_ids_mapping[sg.request_id].add(seq_id) + return seq_ids, request_id_seq_ids_mapping + + +class HiddenStates(msgspec.Struct, array_like=True, + omit_defaults=True): # type: ignore[call-arg] + """Hidden states corresponding to in-progress sequences. + Used in speculative decoding to pass hidden states from + the target model to the proposer model. + + seq_ids are the sequence ids of each entry of the batch + dimension of the hidden_states tensor""" + # Scorer hidden states. For prefill step, it is used for hidden states of + # all tokens, whereas for decode step, it use used for last accepted tokens. + hidden_states: torch.Tensor + # The sequence group metadata list. Only needed for decode step. + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + # Scorer hidden states of the 2nd last token proposed by the proposer ( + # irrespective of whether it was accepted or not). Only used for cases when + # last proposed token is accepted (i.e., in case of bonus tokens). For the + # case of no bonus tokens, these are ignored. + second_last_token_hidden_states: Optional[torch.Tensor] = None + + _seq_ids: List[int] = msgspec.field(default_factory=list) + + def __post_init__(self): + if self.seq_group_metadata_list is not None: + assert len(self.seq_group_metadata_list) == len(self.hidden_states) + self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) + + @property + def seq_ids(self) -> List[int]: + return self._seq_ids + + def update(self, + hidden_states: torch.Tensor, + seq_group_metadata_list: List[SequenceGroupMetadata], + second_last_token_hidden_states: Optional[torch.Tensor] = None): + """Update hidden states from target model invocation. Only used for + decode steps""" + assert len(seq_group_metadata_list) == len(hidden_states) + self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) + self.hidden_states = torch.cat([self.hidden_states, hidden_states]) + + if self.second_last_token_hidden_states is not None: + # Adding dummy hidden_states to this to maintain same shape + self.second_last_token_hidden_states = torch.cat([ + self.second_last_token_hidden_states, + torch.zeros_like(hidden_states) + if second_last_token_hidden_states is None else + second_last_token_hidden_states + ]) + + def prune(self, + seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + """Prune to provided list of sequence ids. Only used for decode steps. + """ + # Currently this prunes all seq_ids not present in + # seq_group_metadata_list which might cause problems where a sequence + # may be "paused" then "resumed" later. This should only prune sequences + # which are confirmed to be aborted. + seq_ids = get_all_seq_ids(seq_group_metadata_list) + if seq_ids != self._seq_ids: + # Batch contents changed - prune removed sequences. + index = [self._seq_ids.index(seq_id) for seq_id in seq_ids] + self.hidden_states = self.hidden_states[index] + if self.second_last_token_hidden_states is not None: + self.second_last_token_hidden_states = self\ + .second_last_token_hidden_states[index] + self._seq_ids = seq_ids + + def expand_with_bonus_tokens( + self, seq_with_bonus_token_in_last_step: set) -> None: + """Expand hidden states for sequences with bonus tokens. 
This is in + alignment with `MultiStepWorker._expand_execute_model_request`.""" + if self.second_last_token_hidden_states is None \ + or not seq_with_bonus_token_in_last_step: + return + + index = [] + for seq_id in self._seq_ids: + i = self._seq_ids.index(seq_id) + if seq_id in seq_with_bonus_token_in_last_step: + index.append(i + len(self._seq_ids)) + index.append(i) + + self.hidden_states = torch.cat( + [self.hidden_states, self.second_last_token_hidden_states])[index] + + +class ExecuteModelRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] + """The model execution request, containing CPU metadata only. The LLM + engine should create an instance of this class for each request batch.""" + # The sequence group metadata list. + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]] + # Blocks to swap in. List of CPU -> GPU block number. + blocks_to_swap_in: List[Tuple[int, + int]] = msgspec.field(default_factory=list) + # Blocks to swap out. List of GPU -> CPU block number. + blocks_to_swap_out: List[Tuple[int, + int]] = msgspec.field(default_factory=list) + # Blocks to copy. Source to dest block. + blocks_to_copy: List[Tuple[int, int]] = msgspec.field(default_factory=list) + # Virtual engine ID for pipeline parallel. + virtual_engine: int = 0 + # The number of slots for lookahead decoding. + num_lookahead_slots: int = 0 + # The number of requests in the running queue. + running_queue_size: int = 0 + # Optional hidden states from prior step. + previous_hidden_states: Optional[HiddenStates] = None + # The number of forward steps to run. + num_steps: int = 1 + # Finished request ids since last step. + finished_requests_ids: List[str] = msgspec.field(default_factory=list) + # The last sampled token ids for multi step decoding. 
+ last_sampled_token_ids: Optional[torch.Tensor] = None + # Async callback + async_callback: Optional[Callable] = None + + @property + def is_first_multi_step(self) -> bool: + # TODO(will) make this be able to handle batches with variable number of + # steps + assert len(self.seq_group_metadata_list) > 0 + first_seq_group = self.seq_group_metadata_list[0] + assert first_seq_group.state is not None + return first_seq_group.state.current_step == 0 + + @property + def is_last_step(self) -> bool: + # TODO(will) make this be able to handle batches with variable number of + # steps + assert len(self.seq_group_metadata_list) > 0 + first_seq_group = self.seq_group_metadata_list[0] + assert first_seq_group.state is not None + return first_seq_group.state.remaining_steps == 1 + + @property + def current_step(self) -> int: + # TODO(will) make this be able to handle batches with variable number of + # steps + assert len(self.seq_group_metadata_list) > 0 + state = self.seq_group_metadata_list[0].state + assert state is not None + return state.current_step + + def clone( + self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]] + ) -> "ExecuteModelRequest": + """Clone the request with a new sequence group metadata list.""" + return ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=self.blocks_to_swap_in.copy(), + blocks_to_swap_out=self.blocks_to_swap_out.copy(), + blocks_to_copy=self.blocks_to_copy.copy(), + virtual_engine=self.virtual_engine, + num_lookahead_slots=self.num_lookahead_slots, + running_queue_size=self.running_queue_size, + previous_hidden_states=self.previous_hidden_states, + num_steps=self.num_steps, + finished_requests_ids=self.finished_requests_ids, + last_sampled_token_ids=self.last_sampled_token_ids.clone() + if self.last_sampled_token_ids is not None else None, + async_callback=self.async_callback) + + +@dataclass +class SequenceGroupBase: + group_id: str # the original request id before splitting + + assembled_seq_group: Optional[SequenceGroup] = None + + # seq id to a unique index inside this group + seq_id_to_index: Dict[str, int] = field(default_factory=dict) + + # seq ids to be finished + to_be_finished: Dict[str, SequenceGroup] = field(default_factory=dict) + + # seq id to finished sequences + finished_reqs: Dict[str, SequenceGroup] = field(default_factory=dict) + + streaming: bool = False + + output_produced: bool = False + + @staticmethod + def add_request(request_id: str, engine, params, *args, **kwargs): + """When we are ready to add a request with request_id and params + into the engine, we can split the request into multiple requests. + """ + raise NotImplementedError + + def finish_seq(self, seq: SequenceGroup): + """The sequence `seq` finishes, we should record the information. + """ + del self.to_be_finished[seq.request_id] + self.finished_reqs[seq.request_id] = seq + + def maybe_assemble_group( + self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: + """Assemble the sequence group, for producing the final + output, or adding request in the engine again. 
+ """ + raise NotImplementedError + + +class ParallelSampleSequenceGroup(SequenceGroupBase): + + @staticmethod + def add_request(request_id: str, engine, params, **kwargs): + original_params = params + group = ParallelSampleSequenceGroup(request_id) + seqs = [] + for i in range(original_params.n): + request_id_i = f"{request_id}_parallel_sample_{i}" + group.seq_id_to_index[request_id_i] = i + params = copy.deepcopy(original_params) + params.n = 1 + if params.seed is not None: + params.seed += i + seq_group = engine._add_processed_request( + request_id_i, + params=params, + **kwargs, + ) # type: ignore + assert seq_group is not None + engine.seq_id_to_seq_group[request_id_i] = group + group.to_be_finished[request_id_i] = seq_group + seqs.append(seq_group.seqs[0]) + + # for parallel sampling, the `assembled_seq_group` is always + # available, since we have all the sequences ready, and they + # will not change. + group.assembled_seq_group = SequenceGroup( + request_id=request_id, + seqs=seqs, + arrival_time=seq_group.arrival_time, + sampling_params=original_params, + lora_request=seq_group.lora_request, + pooling_params=seq_group.pooling_params, + pooled_data=seq_group.pooled_data, + encoder_seq=seq_group.encoder_seq, + trace_headers=seq_group.trace_headers, + prompt_adapter_request=seq_group.prompt_adapter_request, + priority=seq_group.priority, + ) + + group.streaming = params.output_kind == RequestOutputKind.DELTA + group.output_produced = False + + def maybe_assemble_group( + self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: + + # in the streaming mode, we will return the assembled sequence + # for the first remaining sequence, and then return None for the + # rest of sequences + if self.streaming: + first_remaining_id = next(iter(self.to_be_finished)) + if seq_group.request_id == first_remaining_id: + return self.assembled_seq_group + return None + + # in the non-streaming mode, we will return the assembled sequence + # when the last sequences finishes, and then return None for the + # rest of the time + if (len(self.to_be_finished) == 1 + and seq_group.request_id in self.to_be_finished + and seq_group.is_finished()): + assert self.assembled_seq_group is not None + params = self.assembled_seq_group.sampling_params + assert isinstance(params, SamplingParams) + if not self.output_produced: + self.output_produced = True + if params._real_n is not None: + # Get the top-n sequences. 
+ n = params._real_n or params.n + seqs = self.assembled_seq_group.seqs + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] + self.assembled_seq_group.seqs = top_n_seqs + return self.assembled_seq_group + if self.output_produced: + return None + return None diff --git a/.venv/lib/python3.11/site-packages/vllm/tracing.py b/.venv/lib/python3.11/site-packages/vllm/tracing.py new file mode 100644 index 0000000000000000000000000000000000000000..bf069ad84fd42b815452b6dd72cb3fedde20e053 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/tracing.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Mapping, Optional + +from vllm.logger import init_logger +from vllm.utils import run_once + +TRACE_HEADERS = ["traceparent", "tracestate"] + +logger = init_logger(__name__) + +_is_otel_imported = False +otel_import_error_traceback: Optional[str] = None +try: + from opentelemetry.context.context import Context + from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider + from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator) + _is_otel_imported = True +except ImportError: + # Capture and format traceback to provide detailed context for the import + # error. Only the string representation of the error is retained to avoid + # memory leaks. + # See https://github.com/vllm-project/vllm/pull/7266#discussion_r1707395458 + import traceback + otel_import_error_traceback = traceback.format_exc() + + class Context: # type: ignore + pass + + class BaseSpanAttributes: # type: ignore + pass + + class SpanKind: # type: ignore + pass + + class Tracer: # type: ignore + pass + + +def is_otel_available() -> bool: + return _is_otel_imported + + +def init_tracer(instrumenting_module_name: str, + otlp_traces_endpoint: str) -> Optional[Tracer]: + if not is_otel_available(): + raise ValueError( + "OpenTelemetry is not available. Unable to initialize " + "a tracer. Ensure OpenTelemetry packages are installed. 
" + f"Original error:\n{otel_import_error_traceback}") + trace_provider = TracerProvider() + + span_exporter = get_span_exporter(otlp_traces_endpoint) + trace_provider.add_span_processor(BatchSpanProcessor(span_exporter)) + set_tracer_provider(trace_provider) + + tracer = trace_provider.get_tracer(instrumenting_module_name) + return tracer + + +def get_span_exporter(endpoint): + protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc") + if protocol == "grpc": + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter) + elif protocol == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter) # type: ignore + else: + raise ValueError( + f"Unsupported OTLP protocol '{protocol}' is configured") + + return OTLPSpanExporter(endpoint=endpoint) + + +def extract_trace_context( + headers: Optional[Mapping[str, str]]) -> Optional[Context]: + if is_otel_available(): + headers = headers or {} + return TraceContextTextMapPropagator().extract(headers) + else: + return None + + +def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: + + return {h: headers[h] for h in TRACE_HEADERS if h in headers} + + +class SpanAttributes: + # Attribute names copied from here to avoid version conflicts: + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md + GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens" + GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens" + GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens" + GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p" + GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature" + GEN_AI_RESPONSE_MODEL = "gen_ai.response.model" + # Attribute names added until they are added to the semantic conventions: + GEN_AI_REQUEST_ID = "gen_ai.request.id" + GEN_AI_REQUEST_N = "gen_ai.request.n" + GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e" + GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" + # Time taken in the forward pass for this across all workers + GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = ( + "gen_ai.latency.time_in_model_forward") + # Time taken in the model execute function. This will include model + # forward, block/sync across workers, cpu-gpu sync time and sampling time. 
+ GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = ( + "gen_ai.latency.time_in_model_execute") + + +def contains_trace_headers(headers: Mapping[str, str]) -> bool: + return any(h in headers for h in TRACE_HEADERS) + + +@run_once +def log_tracing_disabled_warning() -> None: + logger.warning( + "Received a request with trace context but tracing is disabled") diff --git a/.venv/lib/python3.11/site-packages/vllm/utils.py b/.venv/lib/python3.11/site-packages/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8b926959875732bf5ac2829e46fa6853cdc54687 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/utils.py @@ -0,0 +1,2272 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import asyncio +import concurrent +import contextlib +import datetime +import enum +import gc +import getpass +import importlib.metadata +import importlib.util +import inspect +import ipaddress +import multiprocessing +import os +import re +import resource +import signal +import socket +import subprocess +import sys +import tempfile +import threading +import time +import traceback +import uuid +import warnings +import weakref +from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task +from collections import OrderedDict, UserDict, defaultdict +from collections.abc import Hashable, Iterable, Mapping +from dataclasses import dataclass, field +from functools import cache, lru_cache, partial, wraps +from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, + Dict, Generator, Generic, Iterator, List, Literal, + NamedTuple, Optional, Tuple, Type, TypeVar, Union, + overload) +from uuid import uuid4 + +import cloudpickle +import numpy as np +import numpy.typing as npt +import psutil +import torch +import torch.types +import yaml +import zmq +import zmq.asyncio +from packaging.version import Version +from torch.library import Library +from typing_extensions import Never, ParamSpec, TypeIs, assert_never + +import vllm.envs as envs +from vllm.logger import enable_trace_function_call, init_logger + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + +# Exception strings for non-implemented encoder/decoder scenarios + +# Reminder: Please update docs/source/features/compatibility_matrix.md +# If the feature combo become valid + +STR_NOT_IMPL_ENC_DEC_SWA = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." + +STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL = \ + "Chunked prefill for encoder/decoder models " + \ + "is not currently supported." 
+ +STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( + "Models with logits_soft_cap " + "require FlashInfer backend, which is " + "currently not supported for encoder/decoder " + "models.") + +STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is currently not currently " + "supported with encoder/decoder " + "models.") + +STR_NOT_IMPL_ENC_DEC_PP = ("Pipeline parallelism is not " + "currently supported with " + "encoder/decoder models.") + +STR_NOT_IMPL_ENC_DEC_MM = ("Multimodal is not currently " + "supported with encoder/decoder " + "models.") + +STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not " + "currently supported with encoder/" + "decoder models.") + +STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " + "backends currently supported with encoder/" + "decoder models.") + +STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not " + "currently supported with encoder/" + "decoder models.") + +# Efficiently import all enc/dec error strings +# rather than having to import all of the above +STR_NOT_IMPL_ENC_DEC_ERR_STRS = { + "STR_NOT_IMPL_ENC_DEC_SWA": STR_NOT_IMPL_ENC_DEC_SWA, + "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE": STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + "STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL": + STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL, + "STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP": STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP, + "STR_NOT_IMPL_ENC_DEC_LORA": STR_NOT_IMPL_ENC_DEC_LORA, + "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP, + "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, + "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, + "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, + "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, +} + +# Constants related to forcing the attention backend selection + +# String name of register which may be set in order to +# force auto-selection of attention backend by Attention +# wrapper +STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND" + +# Possible string values of STR_BACKEND_ENV_VAR +# register, corresponding to possible backends +STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" +STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA" +STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH" +STR_XFORMERS_ATTN_VAL: str = "XFORMERS" +STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" +STR_INVALID_VAL: str = "INVALID" + +GB_bytes = 1_000_000_000 +"""The number of bytes in one gigabyte (GB).""" + +GiB_bytes = 1 << 30 +"""The number of bytes in one gibibyte (GiB).""" + +STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.half, + "bfloat16": torch.bfloat16, + "float": torch.float, + "fp8": torch.uint8, + "fp8_e4m3": torch.uint8, + "fp8_e5m2": torch.uint8, +} + +TORCH_DTYPE_TO_NUMPY_DTYPE = { + torch.float16: np.float16, + torch.float32: np.float32, + torch.float64: np.float64, + torch.uint8: np.uint8, + torch.int32: np.int32, + torch.int64: np.int64, +} + +P = ParamSpec('P') +T = TypeVar("T") +U = TypeVar("U") + +_K = TypeVar("_K", bound=Hashable) +_V = TypeVar("_V") + + +class _Sentinel: + ... 
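+# _Sentinel exists only to create ALL_PINNED_SENTINEL (below), which
+# LRUCache.remove_oldest() uses to distinguish "every remaining key is
+# pinned" from a real cache key and to raise instead of evicting.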
+ + +ALL_PINNED_SENTINEL = _Sentinel() + + +class Device(enum.Enum): + GPU = enum.auto() + CPU = enum.auto() + + +class LayerBlockType(enum.Enum): + attention = "attention" + mamba = "mamba" + + +class Counter: + + def __init__(self, start: int = 0) -> None: + self.counter = start + + def __next__(self) -> int: + i = self.counter + self.counter += 1 + return i + + def reset(self) -> None: + self.counter = 0 + + +class CacheInfo(NamedTuple): + hits: int + total: int + + @property + def hit_ratio(self) -> float: + if self.total == 0: + return 0 + + return self.hits / self.total + + +class LRUCache(Generic[_K, _V]): + """Note: This class is not thread safe!""" + + def __init__(self, capacity: int) -> None: + self.cache = OrderedDict[_K, _V]() + self.pinned_items = set[_K]() + self.capacity = capacity + + self._hits = 0 + self._total = 0 + + def __contains__(self, key: _K) -> bool: + return key in self.cache + + def __len__(self) -> int: + return len(self.cache) + + def __getitem__(self, key: _K) -> _V: + value = self.cache[key] # Raise KeyError if not exists + self.cache.move_to_end(key) + return value + + def __setitem__(self, key: _K, value: _V) -> None: + self.put(key, value) + + def __delitem__(self, key: _K) -> None: + self.pop(key) + + def stat(self) -> CacheInfo: + return CacheInfo(hits=self._hits, total=self._total) + + def touch(self, key: _K) -> None: + self.cache.move_to_end(key) + + def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]: + value: Optional[_V] + if key in self.cache: + value = self.cache[key] + self.cache.move_to_end(key) + + self._hits += 1 + else: + value = default + + self._total += 1 + return value + + def put(self, key: _K, value: _V) -> None: + self.cache[key] = value + self.cache.move_to_end(key) + self._remove_old_if_needed() + + def pin(self, key: _K) -> None: + """ + Pins a key in the cache preventing it from being + evicted in the LRU order. + """ + if key not in self.cache: + raise ValueError(f"Cannot pin key: {key} not in cache.") + self.pinned_items.add(key) + + def _unpin(self, key: _K) -> None: + self.pinned_items.remove(key) + + def _on_remove(self, key: _K, value: Optional[_V]) -> None: + pass + + def remove_oldest(self, *, remove_pinned: bool = False) -> None: + if not self.cache: + return + + if not remove_pinned: + # pop the oldest item in the cache that is not pinned + lru_key = next( + (key for key in self.cache if key not in self.pinned_items), + ALL_PINNED_SENTINEL) + if lru_key is ALL_PINNED_SENTINEL: + raise RuntimeError("All items are pinned, " + "cannot remove oldest from the cache.") + else: + lru_key = next(iter(self.cache)) + self.pop(lru_key) # type: ignore + + def _remove_old_if_needed(self) -> None: + while len(self.cache) > self.capacity: + self.remove_oldest() + + def pop(self, key: _K, default: Optional[_V] = None) -> Optional[_V]: + run_on_remove = key in self.cache + value = self.cache.pop(key, default) + # remove from pinned items + if key in self.pinned_items: + self._unpin(key) + if run_on_remove: + self._on_remove(key, value) + return value + + def clear(self) -> None: + while len(self.cache) > 0: + self.remove_oldest(remove_pinned=True) + self.cache.clear() + + +class PyObjectCache: + """Used to cache python objects to avoid object allocations + across scheduler iterations. 
+ """ + + def __init__(self, obj_builder): + self._obj_builder = obj_builder + self._index = 0 + + self._obj_cache = [] + for _ in range(128): + self._obj_cache.append(self._obj_builder()) + + def _grow_cache(self): + # Double the size of the cache + num_objs = len(self._obj_cache) + for _ in range(num_objs): + self._obj_cache.append(self._obj_builder()) + + def get_object(self): + """Returns a pre-allocated cached object. If there is not enough + objects, then the cache size will double. + """ + if self._index >= len(self._obj_cache): + self._grow_cache() + assert self._index < len(self._obj_cache) + + obj = self._obj_cache[self._index] + self._index += 1 + + return obj + + def reset(self): + """Makes all cached-objects available for the next scheduler iteration. + """ + self._index = 0 + + +@cache +def get_max_shared_memory_bytes(gpu: int = 0) -> int: + """Returns the maximum shared memory per thread block in bytes.""" + from vllm import _custom_ops as ops + max_shared_mem = ( + ops.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail + assert max_shared_mem > 0, "max_shared_mem can not be zero" + return int(max_shared_mem) + + +def get_cpu_memory() -> int: + """Returns the total CPU memory of the node in bytes.""" + return psutil.virtual_memory().total + + +def random_uuid() -> str: + return str(uuid.uuid4().hex) + + +def make_async( + func: Callable[P, T], + executor: Optional[concurrent.futures.Executor] = None +) -> Callable[P, Awaitable[T]]: + """Take a blocking function, and run it on in an executor thread. + + This function prevents the blocking function from blocking the + asyncio event loop. + The code in this function needs to be thread safe. + """ + + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: + loop = asyncio.get_event_loop() + p_func = partial(func, *args, **kwargs) + return loop.run_in_executor(executor=executor, func=p_func) + + return _async_wrapper + + +def _next_task(iterator: AsyncGenerator[T, None], + loop: AbstractEventLoop) -> Task: + # Can use anext() in python >= 3.10 + return loop.create_task(iterator.__anext__()) # type: ignore[arg-type] + + +async def merge_async_iterators( + *iterators: AsyncGenerator[T, + None], ) -> AsyncGenerator[Tuple[int, T], None]: + """Merge multiple asynchronous iterators into a single iterator. + + This method handle the case where some iterators finish before others. + When it yields, it yields a tuple (i, item) where i is the index of the + iterator that yields the item. 
+ """ + + loop = asyncio.get_running_loop() + + awaits = {_next_task(pair[1], loop): pair for pair in enumerate(iterators)} + try: + while awaits: + done, _ = await asyncio.wait(awaits.keys(), + return_when=FIRST_COMPLETED) + for d in done: + pair = awaits.pop(d) + try: + item = await d + i, it = pair + awaits[_next_task(it, loop)] = pair + yield i, item + except StopAsyncIteration: + pass + finally: + # Cancel any remaining iterators + for f, (_, it) in awaits.items(): + with contextlib.suppress(BaseException): + f.cancel() + await it.aclose() + + +async def collect_from_async_generator( + iterator: AsyncGenerator[T, None]) -> List[T]: + """Collect all items from an async generator into a list.""" + items = [] + async for item in iterator: + items.append(item) + return items + + +def get_ip() -> str: + host_ip = envs.VLLM_HOST_IP + if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ: + logger.warning( + "The environment variable HOST_IP is deprecated and ignored, as" + " it is often used by Docker and other software to" + "interact with the container's network stack. Please " + "use VLLM_HOST_IP instead to set the IP address for vLLM processes" + " to communicate with each other.") + if host_ip: + return host_ip + + # IP is not set, try to get it from the network interface + + # try ipv4 + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except Exception: + pass + + # try ipv6 + try: + s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + # Google's public DNS server, see + # https://developers.google.com/speed/public-dns/docs/using#addresses + s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except Exception: + pass + + warnings.warn( + "Failed to get the IP address, using 0.0.0.0 by default." + "The value can be set by the environment variable" + " VLLM_HOST_IP or HOST_IP.", + stacklevel=2) + return "0.0.0.0" + + +def is_valid_ipv6_address(address: str) -> bool: + try: + ipaddress.IPv6Address(address) + return True + except ValueError: + return False + + +def get_distributed_init_method(ip: str, port: int) -> str: + # Brackets are not permitted in ipv4 addresses, + # see https://github.com/python/cpython/issues/103848 + return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" + + +def get_open_zmq_ipc_path() -> str: + base_rpc_path = envs.VLLM_RPC_BASE_PATH + return f"ipc://{base_rpc_path}/{uuid4()}" + + +def get_open_port() -> int: + port = envs.VLLM_PORT + if port is not None: + while True: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port)) + return port + except OSError: + port += 1 # Increment port number if already in use + logger.info("Port %d is already in use, trying port %d", + port - 1, port) + # try ipv4 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + except OSError: + # try ipv6 + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def find_process_using_port(port: int) -> Optional[psutil.Process]: + # TODO: We can not check for running processes with network + # port on macOS. Therefore, we can not have a full graceful shutdown + # of vLLM. For now, let's not look for processes in this case. 
+ # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/ + if sys.platform.startswith("darwin"): + return None + + for conn in psutil.net_connections(): + if conn.laddr.port == port: + try: + return psutil.Process(conn.pid) + except psutil.NoSuchProcess: + return None + return None + + +def update_environment_variables(envs: Dict[str, str]): + for k, v in envs.items(): + if k in os.environ and os.environ[k] != v: + logger.warning( + "Overwriting environment variable %s " + "from '%s' to '%s'", k, os.environ[k], v) + os.environ[k] = v + + +def chunk_list(lst: List[T], chunk_size: int): + """Yield successive chunk_size chunks from lst.""" + for i in range(0, len(lst), chunk_size): + yield lst[i:i + chunk_size] + + +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def round_up(x: int, y: int) -> int: + return ((x + y - 1) // y) * y + + +def _generate_random_fp8( + tensor: torch.Tensor, + low: float, + high: float, +) -> None: + # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, + # it may occur Inf or NaN if we directly use torch.randint + # to generate random data for fp8 data. + # For example, s.11111.00 in fp8e5m2 format represents Inf. + # | E4M3 | E5M2 + #-----|-------------|------------------- + # Inf | N/A | s.11111.00 + # NaN | s.1111.111 | s.11111.{01,10,11} + from vllm import _custom_ops as ops + tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) + tensor_tmp.uniform_(low, high) + ops.convert_fp8(tensor, tensor_tmp) + del tensor_tmp + + +def get_kv_cache_torch_dtype( + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype: + if isinstance(cache_dtype, str): + if cache_dtype == "auto": + if isinstance(model_dtype, str): + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] + elif isinstance(model_dtype, torch.dtype): + torch_dtype = model_dtype + else: + raise ValueError(f"Invalid model dtype: {model_dtype}") + elif cache_dtype in ["half", "bfloat16", "float"]: + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + elif cache_dtype == "fp8": + torch_dtype = torch.uint8 + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + elif isinstance(cache_dtype, torch.dtype): + torch_dtype = cache_dtype + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + return torch_dtype + + +def create_kv_caches_with_random_flash( + num_blocks: int, + block_size: int, + num_layers: int, + num_heads: int, + head_size: int, + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, + seed: int = 0, + device: Optional[str] = "cuda", +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + from vllm.platforms import current_platform + current_platform.seed_everything(seed) + + torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) + key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) + scale = head_size**-0.5 + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + + for _ in range(num_layers): + key_value_cache = torch.empty(size=key_value_cache_shape, + dtype=torch_dtype, + device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(key_value_cache, -scale, scale) + else: + raise ValueError( + f"Does not support key cache of type {cache_dtype}") + key_caches.append(key_value_cache[:, 0]) + value_caches.append(key_value_cache[:, 1]) + 
return key_caches, value_caches + + +def create_kv_caches_with_random( + num_blocks: int, + block_size: int, + num_layers: int, + num_heads: int, + head_size: int, + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, + seed: int = 0, + device: Optional[str] = "cuda", +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + from vllm.platforms import current_platform + current_platform.seed_everything(seed) + + torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) + + scale = head_size**-0.5 + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + key_caches: List[torch.Tensor] = [] + for _ in range(num_layers): + key_cache = torch.empty(size=key_cache_shape, + dtype=torch_dtype, + device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(key_cache, -scale, scale) + else: + raise ValueError( + f"Does not support key cache of type {cache_dtype}") + key_caches.append(key_cache) + + value_cache_shape = (num_blocks, num_heads, head_size, block_size) + value_caches: List[torch.Tensor] = [] + for _ in range(num_layers): + value_cache = torch.empty(size=value_cache_shape, + dtype=torch_dtype, + device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(value_cache, -scale, scale) + else: + raise ValueError( + f"Does not support value cache of type {cache_dtype}") + value_caches.append(value_cache) + return key_caches, value_caches + + +@cache +def is_pin_memory_available() -> bool: + from vllm.platforms import current_platform + return current_platform.is_pin_memory_available() + + +class DeviceMemoryProfiler: + + def __init__(self, device: Optional[torch.types.Device] = None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. + from vllm.platforms import current_platform + return current_platform.get_current_memory_usage(self.device) + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() + + +def make_ndarray_with_pad( + x: List[List[T]], + pad: T, + dtype: npt.DTypeLike, + *, + max_len: Optional[int] = None, +) -> npt.NDArray: + """ + Make a padded array from 2D inputs. + + The padding is applied to the end of each inner list until it reaches + `max_len`. + """ + if max_len is None: + # Unlike for most functions, map is faster than a genexpr over `len` + max_len = max(map(len, x), default=0) + + padded_x = np.full((len(x), max_len), pad, dtype=dtype) + for ind, blocktb in enumerate(x): + assert len(blocktb) <= max_len + padded_x[ind, :len(blocktb)] = blocktb + + return padded_x + + +def make_tensor_with_pad( + x: List[List[T]], + pad: T, + dtype: torch.dtype, + *, + max_len: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + pin_memory: bool = False, +) -> torch.Tensor: + """ + Make a padded tensor from 2D inputs. 
+ + The padding is applied to the end of each inner list until it reaches + `max_len`. + """ + np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] + padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) + + tensor = torch.from_numpy(padded_x).to(device) + if pin_memory: + tensor = tensor.pin_memory() + + return tensor + + +def async_tensor_h2d( + data: list, + dtype: torch.dtype, + target_device: Union[str, torch.device], + pin_memory: bool, +) -> torch.Tensor: + """Asynchronously create a tensor and copy it from host to device.""" + t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu") + return t.to(device=target_device, non_blocking=True) + + +def get_dtype_size(dtype: torch.dtype) -> int: + """Get the size of the data type in bytes.""" + return torch.tensor([], dtype=dtype).element_size() + + +def align_to_256bytes(extent: int, dtype: torch.dtype) -> int: + dtype_size = get_dtype_size(dtype) + eles_per_256bytes = 256 // dtype_size + return round_up(extent, eles_per_256bytes) + + +# `collections` helpers +def is_list_of( + value: object, + typ: Union[type[T], tuple[type[T], ...]], + *, + check: Literal["first", "all"] = "first", +) -> TypeIs[List[T]]: + if not isinstance(value, list): + return False + + if check == "first": + return len(value) == 0 or isinstance(value[0], typ) + elif check == "all": + return all(isinstance(v, typ) for v in value) + + assert_never(check) + + +JSONTree = Union[Dict[str, "JSONTree[T]"], List["JSONTree[T]"], + Tuple["JSONTree[T]", ...], T] +"""A nested JSON structure where the leaves need not be JSON-serializable.""" + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: Dict[str, JSONTree[T]], +) -> Dict[str, JSONTree[U]]: + ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: List[JSONTree[T]], +) -> List[JSONTree[U]]: + ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: Tuple[JSONTree[T], ...], +) -> Tuple[JSONTree[U], ...]: + ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: JSONTree[T], +) -> JSONTree[U]: + ... + + +def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: + if isinstance(value, dict): + return {k: json_map_leaves(func, v) for k, v in value.items()} + elif isinstance(value, list): + return [json_map_leaves(func, v) for v in value] + elif isinstance(value, tuple): + return tuple(json_map_leaves(func, v) for v in value) + else: + return func(value) + + +def flatten_2d_lists(lists: List[List[T]]) -> List[T]: + """Flatten a list of lists to a single list.""" + return [item for sublist in lists for item in sublist] + + +def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): + """ + Unlike :class:`itertools.groupby`, groups are not broken by + non-contiguous data. + """ + groups = defaultdict[_K, list[_V]](list) + + for value in values: + groups[key(value)].append(value) + + return groups.items() + + +# TODO: This function can be removed if transformer_modules classes are +# serialized by value when communicating between processes +def init_cached_hf_modules() -> None: + """ + Lazy initialization of the Hugging Face modules. + """ + from transformers.dynamic_module_utils import init_hf_modules + init_hf_modules() + + +@cache +def find_library(lib_name: str) -> str: + """ + Find the library file in the system. + `lib_name` is full filename, with both prefix and suffix. + This function resolves `lib_name` to the full path of the library. 
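+    For example (illustrative), find_library("libcuda.so.1") typically resolves
+    to a path such as "/lib/x86_64-linux-gnu/libcuda.so.1".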
+ """ + # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa + # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard + # `/sbin/ldconfig` should exist in all Linux systems. + # `/sbin/ldconfig` searches the library in the system + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() + # each line looks like the following: + # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1 + locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line] + # `LD_LIBRARY_PATH` searches the library in the user-defined paths + env_ld_library_path = envs.LD_LIBRARY_PATH + if not locs and env_ld_library_path: + locs = [ + os.path.join(dir, lib_name) + for dir in env_ld_library_path.split(":") + if os.path.exists(os.path.join(dir, lib_name)) + ] + if not locs: + raise ValueError(f"Cannot find {lib_name} in the system.") + return locs[0] + + +def find_nccl_library() -> str: + """ + We either use the library file specified by the `VLLM_NCCL_SO_PATH` + environment variable, or we find the library file brought by PyTorch. + After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be + found by `ctypes` automatically. + """ + so_file = envs.VLLM_NCCL_SO_PATH + + # manually load the nccl library + if so_file: + logger.info( + "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", + so_file) + else: + if torch.version.cuda is not None: + so_file = "libnccl.so.2" + elif torch.version.hip is not None: + so_file = "librccl.so.1" + else: + raise ValueError("NCCL only supports CUDA and ROCm backends.") + logger.info("Found nccl from library %s", so_file) + return so_file + + +prev_set_stream = torch.cuda.set_stream + +_current_stream = None + + +def _patched_set_stream(stream: torch.cuda.Stream) -> None: + global _current_stream + _current_stream = stream + prev_set_stream(stream) + + +torch.cuda.set_stream = _patched_set_stream + + +def current_stream() -> torch.cuda.Stream: + """ + replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`. + it turns out that `torch.cuda.current_stream()` is quite expensive, + as it will construct a new stream object at each call. + here we patch `torch.cuda.set_stream` to keep track of the current stream + directly, so that we can avoid calling `torch.cuda.current_stream()`. + + the underlying hypothesis is that we do not call `torch._C._cuda_setStream` + from C/C++ code. + """ + global _current_stream + if _current_stream is None: + # when this function is called before any stream is set, + # we return the default stream. 
+ _current_stream = torch.cuda.current_stream() + return _current_stream + + +def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: + """Set up function tracing for the current thread, + if enabled via the VLLM_TRACE_FUNCTION environment variable + """ + + if envs.VLLM_TRACE_FUNCTION: + tmp_dir = tempfile.gettempdir() + # add username to tmp_dir to avoid permission issues + tmp_dir = os.path.join(tmp_dir, getpass.getuser()) + filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" + f"_thread_{threading.get_ident()}_" + f"at_{datetime.datetime.now()}.log").replace(" ", "_") + log_path = os.path.join(tmp_dir, "vllm", + f"vllm-instance-{vllm_config.instance_id}", + filename) + os.makedirs(os.path.dirname(log_path), exist_ok=True) + enable_trace_function_call(log_path) + + +# `functools` helpers +def identity(value: T, **kwargs) -> T: + """Returns the first provided value.""" + return value + + +F = TypeVar('F', bound=Callable[..., Any]) + + +def deprecate_args( + start_index: int, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + params = inspect.signature(fn).parameters + pos_types = ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + pos_kws = [ + kw for kw, param in params.items() if param.kind in pos_types + ] + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_args = pos_kws[start_index:len(args)] + if deprecated_args: + msg = ( + f"The positional arguments {deprecated_args} are " + "deprecated and will be removed in a future update.") + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper + + +def deprecate_kwargs( + *kws: str, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: + deprecated_kws = set(kws) + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_kwargs = kwargs.keys() & deprecated_kws + if deprecated_kwargs: + msg = ( + f"The keyword arguments {deprecated_kwargs} are " + "deprecated and will be removed in a future update.") + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper + + +@lru_cache(maxsize=8) +def _cuda_device_count_stateless( + cuda_visible_devices: Optional[str] = None) -> int: + # Note: cuda_visible_devices is not used, but we keep it as an argument for + # LRU Cache purposes. 
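+    # The public wrapper below passes envs.CUDA_VISIBLE_DEVICES here, so the
+    # result is cached per CUDA_VISIBLE_DEVICES value rather than process-wide.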
+ + # Code below is based on + # https://github.com/pytorch/pytorch/blob/ + # c1cd946818442aca8c7f812b16d187ce1586c3bc/ + # torch/cuda/__init__.py#L831C1-L831C17 + import torch.cuda + import torch.version + + from vllm.platforms import current_platform + if not torch.cuda._is_compiled(): + return 0 + if current_platform.is_rocm(): + # ROCm uses amdsmi instead of nvml for stateless device count + # This requires a sufficiently modern version of Torch 2.4.0 + raw_count = torch.cuda._device_count_amdsmi() if (hasattr( + torch.cuda, "_device_count_amdsmi")) else -1 + else: + raw_count = torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count + return r + + +def cuda_device_count_stateless() -> int: + """Get number of CUDA devices, caching based on the value of + CUDA_VISIBLE_DEVICES at the time of call. + + This should be used instead of torch.cuda.device_count() + unless CUDA_VISIBLE_DEVICES has already been set to the desired + value.""" + + # This can be removed and simply replaced with torch.cuda.get_device_count + # after https://github.com/pytorch/pytorch/pull/122815 is released. + return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) + + +def cuda_is_initialized() -> bool: + """Check if CUDA is initialized.""" + if not torch.cuda._is_compiled(): + return False + return torch.cuda.is_initialized() + + +def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]: + """Make an instance method that weakly references + its associated instance and no-ops once that + instance is collected.""" + ref = weakref.ref(bound_method.__self__) # type: ignore[attr-defined] + unbound = bound_method.__func__ # type: ignore[attr-defined] + + def weak_bound(*args, **kwargs) -> None: + if inst := ref(): + unbound(inst, *args, **kwargs) + + return weak_bound + + +#From: https://stackoverflow.com/a/4104188/2749989 +def run_once(f: Callable[P, None]) -> Callable[P, None]: + + def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) + + wrapper.has_run = False # type: ignore[attr-defined] + return wrapper + + +class StoreBoolean(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + if values.lower() == "true": + setattr(namespace, self.dest, True) + elif values.lower() == "false": + setattr(namespace, self.dest, False) + else: + raise ValueError(f"Invalid boolean value: {values}. 
" + "Expected 'true' or 'false'.") + + +class SortedHelpFormatter(argparse.HelpFormatter): + """SortedHelpFormatter that sorts arguments by their option strings.""" + + def add_arguments(self, actions): + actions = sorted(actions, key=lambda x: x.option_strings) + super().add_arguments(actions) + + +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def __init__(self, *args, **kwargs): + # Set the default 'formatter_class' to SortedHelpFormatter + if 'formatter_class' not in kwargs: + kwargs['formatter_class'] = SortedHelpFormatter + super().__init__(*args, **kwargs) + + def parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + if '--config' in args: + args = self._pull_args_from_config(args) + + # Convert underscores to dashes and vice versa in argument names + processed_args = [] + for arg in args: + if arg.startswith('--'): + if '=' in arg: + key, value = arg.split('=', 1) + key = '--' + key[len('--'):].replace('_', '-') + processed_args.append(f'{key}={value}') + else: + processed_args.append('--' + + arg[len('--'):].replace('_', '-')) + elif arg.startswith('-O') and arg != '-O' and len(arg) == 2: + # allow -O flag to be used without space, e.g. -O3 + processed_args.append('-O') + processed_args.append(arg[2:]) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) + + def _pull_args_from_config(self, args: List[str]) -> List[str]: + """Method to pull arguments specified in the config file + into the command-line args variable. + + The arguments in config file will be inserted between + the argument list. + + example: + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + ```python + $: vllm {serve,chat,complete} "facebook/opt-12B" \ + --config config.yaml -tp 2 + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--config', 'config.yaml', + '-tp', '2' + ] + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--port', '12323', + '--tensor-parallel-size', '4', + '-tp', '2' + ] + ``` + + Please note how the config args are inserted after the sub command. + this way the order of priorities is maintained when these are args + parsed by super(). + """ + assert args.count( + '--config') <= 1, "More than one config file specified!" + + index = args.index('--config') + if index == len(args) - 1: + raise ValueError("No config file specified! \ + Please check your command-line arguments.") + + file_path = args[index + 1] + + config_args = self._load_config_file(file_path) + + # 0th index is for {serve,chat,complete} + # followed by model_tag (only for serve) + # followed by config args + # followed by rest of cli args. + # maintaining this order will enforce the precedence + # of cli > config > defaults + if args[0] == "serve": + if index == 1: + raise ValueError( + "No model_tag specified! 
Please check your command-line" + " arguments.") + args = [args[0]] + [ + args[1] + ] + config_args + args[2:index] + args[index + 2:] + else: + args = [args[0]] + config_args + args[1:index] + args[index + 2:] + + return args + + def _load_config_file(self, file_path: str) -> List[str]: + """Loads a yaml file and returns the key value pairs as a + flattened list with argparse like pattern + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + returns: + processed_args: list[str] = [ + '--port': '12323', + '--tensor-parallel-size': '4' + ] + + """ + + extension: str = file_path.split('.')[-1] + if extension not in ('yaml', 'yml'): + raise ValueError( + "Config file must be of a yaml/yml type.\ + %s supplied", extension) + + # only expecting a flat dictionary of atomic types + processed_args: List[str] = [] + + config: Dict[str, Union[int, str]] = {} + try: + with open(file_path) as config_file: + config = yaml.safe_load(config_file) + except Exception as ex: + logger.error( + "Unable to read the config file at %s. \ + Make sure path is correct", file_path) + raise ex + + store_boolean_arguments = [ + action.dest for action in self._actions + if isinstance(action, StoreBoolean) + ] + + for key, value in config.items(): + if isinstance(value, bool) and key not in store_boolean_arguments: + if value: + processed_args.append('--' + key) + else: + processed_args.append('--' + key) + processed_args.append(str(value)) + + return processed_args + + +async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, + **kwargs): + """Utility function to run async task in a lock""" + async with lock: + return await task(*args, **kwargs) + + +def supports_kw( + callable: Callable[..., object], + kw_name: str, + *, + requires_kw_only: bool = False, + allow_var_kwargs: bool = True, +) -> bool: + """Check if a keyword is a valid kwarg for a callable; if requires_kw_only + disallows kwargs names that can also be positional arguments. + """ + params = inspect.signature(callable).parameters + if not params: + return False + + param_val = params.get(kw_name) + + # Types where the it may be valid, i.e., explicitly defined & nonvariadic + passable_kw_types = set((inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY)) + + if param_val: + is_sig_param = param_val.kind in passable_kw_types + # We want kwargs only, but this is passable as a positional arg + if (requires_kw_only and is_sig_param + and param_val.kind != inspect.Parameter.KEYWORD_ONLY): + return False + if ((requires_kw_only + and param_val.kind == inspect.Parameter.KEYWORD_ONLY) + or (not requires_kw_only and is_sig_param)): + return True + + # If we're okay with var-kwargs, it's supported as long as + # the kw_name isn't something like *args, **kwargs + if allow_var_kwargs: + # Get the last param; type is ignored here because params is a proxy + # mapping, but it wraps an ordered dict, and they appear in order. 
+ # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters + last_param = params[next(reversed(params))] # type: ignore + return (last_param.kind == inspect.Parameter.VAR_KEYWORD + and last_param.name != kw_name) + return False + + +def resolve_mm_processor_kwargs( + init_kwargs: Optional[Mapping[str, object]], + inference_kwargs: Optional[Mapping[str, object]], + callable: Callable[..., object], + *, + requires_kw_only: bool = True, + allow_var_kwargs: bool = False, +) -> Dict[str, Any]: + """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., + those who are not explicit keywords to the given callable (of one is + given; otherwise no filtering is done), then merges the kwarg dicts, + giving priority to inference_kwargs if there are any collisions. + + In the case that no kwarg overrides are provided, returns an empty + dict so that it can still be kwarg expanded into the callable later on. + + If allow_var_kwargs=True, allows for things that can be expanded into + kwargs as long as they aren't naming collision for var_kwargs or potential + positional arguments. + """ + # Filter inference time multimodal processor kwargs provided + runtime_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, + overrides=inference_kwargs, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + + # Filter init time multimodal processor kwargs provided + init_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, + overrides=init_kwargs, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + + # Merge the final processor kwargs, prioritizing inference + # time values over the initialization time values. + mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs} + return mm_processor_kwargs + + +def get_allowed_kwarg_only_overrides( + callable: Callable[..., object], + overrides: Optional[Mapping[str, object]], + *, + requires_kw_only: bool = True, + allow_var_kwargs: bool = False, +) -> Dict[str, Any]: + """ + Given a callable which has one or more keyword only params and a dict + mapping param names to values, drop values that can be not be kwarg + expanded to overwrite one or more keyword-only args. This is used in a + few places to handle custom processor overrides for multimodal models, + e.g., for profiling when processor options provided by the user + may affect the number of mm tokens per instance. + + Args: + callable: Callable which takes 0 or more keyword only arguments. + If None is provided, all overrides names are allowed. + overrides: Potential overrides to be used when invoking the callable. + allow_var_kwargs: Allows overrides that are expandable for var kwargs. + + Returns: + Dictionary containing the kwargs to be leveraged which may be used + to overwrite one or more keyword only arguments when invoking the + callable. 
+ """ + if not overrides: + return {} + + # Drop any mm_processor_kwargs provided by the user that + # are not kwargs, unless it can fit it var_kwargs param + filtered_overrides = { + kwarg_name: val + for kwarg_name, val in overrides.items() + if supports_kw(callable, + kwarg_name, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs) + } + + # If anything is dropped, log a warning + dropped_keys = overrides.keys() - filtered_overrides.keys() + if dropped_keys: + if requires_kw_only: + logger.warning( + "The following intended overrides are not keyword-only args " + "and and will be dropped: %s", dropped_keys) + else: + logger.warning( + "The following intended overrides are not keyword args " + "and and will be dropped: %s", dropped_keys) + + return filtered_overrides + + +# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0. +# In particular, the FakeScalarType is not supported for earlier versions of +# PyTorch which breaks dynamo for any ops registered using ScalarType. +def supports_dynamo() -> bool: + base_torch_version = Version(Version(torch.__version__).base_version) + return base_torch_version >= Version("2.4.0") + + +# Some backends use pytorch version < 2.4.0 which doesn't +# support `torch.library.custom_op`. +def supports_custom_op() -> bool: + return hasattr(torch.library, "custom_op") + + +class AtomicCounter: + """An atomic, thread-safe counter""" + + def __init__(self, initial=0): + """Initialize a new atomic counter to given initial value""" + self._value = initial + self._lock = threading.Lock() + + def inc(self, num=1): + """Atomically increment the counter by num and return the new value""" + with self._lock: + self._value += num + return self._value + + def dec(self, num=1): + """Atomically decrement the counter by num and return the new value""" + with self._lock: + self._value -= num + return self._value + + @property + def value(self): + return self._value + + +# Adapted from: https://stackoverflow.com/a/47212782/5082708 +class LazyDict(Mapping[str, T], Generic[T]): + + def __init__(self, factory: Dict[str, Callable[[], T]]): + self._factory = factory + self._dict: Dict[str, T] = {} + + def __getitem__(self, key: str) -> T: + if key not in self._dict: + if key not in self._factory: + raise KeyError(key) + self._dict[key] = self._factory[key]() + return self._dict[key] + + def __setitem__(self, key: str, value: Callable[[], T]): + self._factory[key] = value + + def __iter__(self): + return iter(self._factory) + + def __len__(self): + return len(self._factory) + + +class ClassRegistry(UserDict[Type[T], _V]): + + def __getitem__(self, key: Type[T]) -> _V: + for cls in key.mro(): + if cls in self.data: + return self.data[cls] + + raise KeyError(key) + + def __contains__(self, key: object) -> bool: + return self.contains(key) + + def contains(self, key: object, *, strict: bool = False) -> bool: + if not isinstance(key, type): + return False + + if strict: + return key in self.data + + return any(cls in self.data for cls in key.mro()) + + +def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: + """ + Create a weak reference to a tensor. + The new tensor will share the same data as the original tensor, + but will not keep the original tensor alive. 
+ """ + return torch.ops._C.weak_ref_tensor(tensor) + + +def weak_ref_tensors( + tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]] +) -> Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]: + """ + Convenience function to create weak references to tensors, + for single tensor, list of tensors or tuple of tensors. + """ + if isinstance(tensors, torch.Tensor): + return weak_ref_tensor(tensors) + if isinstance(tensors, list): + return [weak_ref_tensor(t) for t in tensors] + if isinstance(tensors, tuple): + return tuple(weak_ref_tensor(t) for t in tensors) + raise ValueError("Invalid type for tensors") + + +def is_in_doc_build() -> bool: + try: + from sphinx.ext.autodoc.mock import _MockModule + return isinstance(torch, _MockModule) + except ModuleNotFoundError: + return False + + +def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): + """ + Import a Python file according to its file path. + + Based on the official recipe: + https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly + """ + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ModuleNotFoundError(f"No module named '{module_name}'") + + assert spec.loader is not None + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +@cache +def get_vllm_optional_dependencies(): + metadata = importlib.metadata.metadata("vllm") + requirements = metadata.get_all("Requires-Dist", []) + extras = metadata.get_all("Provides-Extra", []) + + return { + extra: [ + re.split(r";|>=|<=|==", req)[0] for req in requirements + if req.endswith(f'extra == "{extra}"') + ] + for extra in extras + } + + +class _PlaceholderBase: + """ + Disallows downstream usage of placeholder modules. + + We need to explicitly override each dunder method because + :meth:`__getattr__` is not called when they are accessed. + + See also: + [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) + """ + + def __getattr__(self, key: str) -> Never: + """ + The main class should implement this to throw an error + for attribute accesses representing downstream usage. + """ + raise NotImplementedError + + # [Basic customization] + + def __lt__(self, other: object): + return self.__getattr__("__lt__") + + def __le__(self, other: object): + return self.__getattr__("__le__") + + def __eq__(self, other: object): + return self.__getattr__("__eq__") + + def __ne__(self, other: object): + return self.__getattr__("__ne__") + + def __gt__(self, other: object): + return self.__getattr__("__gt__") + + def __ge__(self, other: object): + return self.__getattr__("__ge__") + + def __hash__(self): + return self.__getattr__("__hash__") + + def __bool__(self): + return self.__getattr__("__bool__") + + # [Callable objects] + + def __call__(self, *args: object, **kwargs: object): + return self.__getattr__("__call__") + + # [Container types] + + def __len__(self): + return self.__getattr__("__len__") + + def __getitem__(self, key: object): + return self.__getattr__("__getitem__") + + def __setitem__(self, key: object, value: object): + return self.__getattr__("__setitem__") + + def __delitem__(self, key: object): + return self.__getattr__("__delitem__") + + # __missing__ is optional according to __getitem__ specification, + # so it is skipped + + # __iter__ and __reversed__ have a default implementation + # based on __len__ and __getitem__, so they are skipped. 
+ + # [Numeric Types] + + def __add__(self, other: object): + return self.__getattr__("__add__") + + def __sub__(self, other: object): + return self.__getattr__("__sub__") + + def __mul__(self, other: object): + return self.__getattr__("__mul__") + + def __matmul__(self, other: object): + return self.__getattr__("__matmul__") + + def __truediv__(self, other: object): + return self.__getattr__("__truediv__") + + def __floordiv__(self, other: object): + return self.__getattr__("__floordiv__") + + def __mod__(self, other: object): + return self.__getattr__("__mod__") + + def __divmod__(self, other: object): + return self.__getattr__("__divmod__") + + def __pow__(self, other: object, modulo: object = ...): + return self.__getattr__("__pow__") + + def __lshift__(self, other: object): + return self.__getattr__("__lshift__") + + def __rshift__(self, other: object): + return self.__getattr__("__rshift__") + + def __and__(self, other: object): + return self.__getattr__("__and__") + + def __xor__(self, other: object): + return self.__getattr__("__xor__") + + def __or__(self, other: object): + return self.__getattr__("__or__") + + # r* and i* methods have lower priority than + # the methods for left operand so they are skipped + + def __neg__(self): + return self.__getattr__("__neg__") + + def __pos__(self): + return self.__getattr__("__pos__") + + def __abs__(self): + return self.__getattr__("__abs__") + + def __invert__(self): + return self.__getattr__("__invert__") + + # __complex__, __int__ and __float__ have a default implementation + # based on __index__, so they are skipped. + + def __index__(self): + return self.__getattr__("__index__") + + def __round__(self, ndigits: object = ...): + return self.__getattr__("__round__") + + def __trunc__(self): + return self.__getattr__("__trunc__") + + def __floor__(self): + return self.__getattr__("__floor__") + + def __ceil__(self): + return self.__getattr__("__ceil__") + + # [Context managers] + + def __enter__(self): + return self.__getattr__("__enter__") + + def __exit__(self, *args: object, **kwargs: object): + return self.__getattr__("__exit__") + + +class PlaceholderModule(_PlaceholderBase): + """ + A placeholder object to use when a module does not exist. + + This enables more informative errors when trying to access attributes + of a module that does not exists. 
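+    For instance (illustrative), accessing any attribute of
+    PlaceholderModule("some_missing_pkg") raises ImportError, pointing to the
+    relevant vllm[extra] when an optional dependency group provides the module.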
+ """ + + def __init__(self, name: str) -> None: + super().__init__() + + # Apply name mangling to avoid conflicting with module attributes + self.__name = name + + def placeholder_attr(self, attr_path: str): + return _PlaceholderModuleAttr(self, attr_path) + + def __getattr__(self, key: str): + name = self.__name + + try: + importlib.import_module(name) + except ImportError as exc: + for extra, names in get_vllm_optional_dependencies().items(): + if name in names: + msg = f"Please install vllm[{extra}] for {extra} support" + raise ImportError(msg) from exc + + raise exc + + raise AssertionError("PlaceholderModule should not be used " + "when the original module can be imported") + + +class _PlaceholderModuleAttr(_PlaceholderBase): + + def __init__(self, module: PlaceholderModule, attr_path: str) -> None: + super().__init__() + + # Apply name mangling to avoid conflicting with module attributes + self.__module = module + self.__attr_path = attr_path + + def placeholder_attr(self, attr_path: str): + return _PlaceholderModuleAttr(self.__module, + f"{self.__attr_path}.{attr_path}") + + def __getattr__(self, key: str): + getattr(self.__module, f"{self.__attr_path}.{key}") + + raise AssertionError("PlaceholderModule should not be used " + "when the original module can be imported") + + +# create a library to hold the custom op +vllm_lib = Library("vllm", "FRAGMENT") # noqa + + +def direct_register_custom_op( + op_name: str, + op_func: Callable, + mutates_args: List[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, + dispatch_key: str = "CUDA", +): + """ + `torch.library.custom_op` can have significant overhead because it + needs to consider complicated dispatching logic. This function + directly registers a custom op and dispatches it to the CUDA backend. + See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5 + for more details. + + By default, the custom op is registered to the vLLM library. If you + want to register it to a different library, you can pass the library + object to the `target_lib` argument. + + IMPORTANT: the lifetime of the operator is tied to the lifetime of the + library object. If you want to bind the operator to a different library, + make sure the library object is alive when the operator is used. + """ + if is_in_doc_build(): + return + + if not supports_custom_op(): + from vllm.platforms import current_platform + assert not current_platform.is_cuda_alike(), ( + "cuda platform needs torch>=2.4 to support custom op, " + "chances are you are using an old version of pytorch " + "or a custom build of pytorch. It is recommended to " + "use vLLM in a fresh new environment and let it install " + "the required dependencies.") + return + + import torch.library + if hasattr(torch.library, "infer_schema"): + schema_str = torch.library.infer_schema(op_func, + mutates_args=mutates_args) + else: + # for pytorch 2.4 + import torch._custom_op.impl + schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) + my_lib = target_lib or vllm_lib + my_lib.define(op_name + schema_str) + my_lib.impl(op_name, op_func, dispatch_key=dispatch_key) + if fake_impl is not None: + my_lib._register_fake(op_name, fake_impl) + + +def resolve_obj_by_qualname(qualname: str) -> Any: + """ + Resolve an object by its fully qualified name. 
+ """ + module_name, obj_name = qualname.rsplit(".", 1) + module = importlib.import_module(module_name) + return getattr(module, obj_name) + + +def kill_process_tree(pid: int): + """ + Kills all descendant processes of the given pid by sending SIGKILL. + + Args: + pid (int): Process ID of the parent process + """ + try: + parent = psutil.Process(pid) + except psutil.NoSuchProcess: + return + + # Get all children recursively + children = parent.children(recursive=True) + + # Send SIGKILL to all children first + for child in children: + with contextlib.suppress(ProcessLookupError): + os.kill(child.pid, signal.SIGKILL) + + # Finally kill the parent + with contextlib.suppress(ProcessLookupError): + os.kill(pid, signal.SIGKILL) + + +@dataclass +class MemorySnapshot: + """Memory snapshot.""" + torch_peak: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 + timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() + + def measure(self): + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. + self.torch_peak = torch.cuda.memory_stats().get( + "allocated_bytes.all.peak", 0) + + self.cuda_memory = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + + # torch.cuda.memory_reserved() is how many bytes + # PyTorch gets from cuda (by calling cudaMalloc, etc.) + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory + self.timestamp = time.time() + + def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": + return MemorySnapshot( + torch_peak=self.torch_peak - other.torch_peak, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) + + +@dataclass +class MemoryProfilingResult: + """Memory profiling result. All numbers are in bytes. + """ + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) + before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + profile_time: float = 0.0 + + +@contextlib.contextmanager +def memory_profiling( + baseline_snapshot: MemorySnapshot, + weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: + """Memory profiling context manager. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. + Note that, before loading the model weights, we also initialize the device + and distributed environment, which may consume some memory. This part is not + included in the weights_memory because PyTorch does not control it. + + The memory in one GPU can be classified into 3 categories: + 1. memory used by anything other than the current vLLM instance. + 2. memory used by torch in the current vLLM instance. + 3. memory used in the current vLLM instance, but not by torch. 
+ + A quantitive example: + + Before creating the current vLLM instance: + category 1: 1 GiB + category 2: 0 GiB + category 3: 0 GiB + + After creating the current vLLM instance and loading the model, + (i.e. before profiling): + category 1: 1 GiB + category 2: 2 GiB (model weights take 2 GiB) + category 3: 0.5 GiB (memory used by NCCL) + + During profiling (peak): + category 1: 1 GiB + category 2: 4 GiB (peak activation tensors take 2 GiB) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + After profiling: + category 1: 1 GiB + category 2: 3 GiB (after garbage-collecting activation tensors) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + In this case, non-kv cache takes 5 GiB in total, including: + a. 2 GiB used by the model weights (category 2) + b. 2 GiB reserved for the peak activation tensors (category 2) + c. 1 GiB used by non-torch components (category 3) + + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. + + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). + + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). + """ # noqa + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + result = MemoryProfilingResult() + + result.before_create = baseline_snapshot + # the part of memory used for holding the model weights + result.weights_memory = weights_memory + + result.before_profile.measure() + + yield result + + gc.collect() + torch.cuda.empty_cache() + + result.after_profile.measure() + + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 +def set_ulimit(target_soft_limit=65535): + resource_type = resource.RLIMIT_NOFILE + current_soft, current_hard = resource.getrlimit(resource_type) + + if current_soft < target_soft_limit: + try: + resource.setrlimit(resource_type, + (target_soft_limit, current_hard)) + except ValueError as e: + logger.warning( + "Found ulimit of %s and failed to automatically increase" + "with error %s. This can cause fd limit errors like" + "`OSError: [Errno 24] Too many open files`. 
Consider " + "increasing with ulimit -n", current_soft, e) + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501 +def get_exception_traceback(): + etype, value, tb = sys.exc_info() + err_str = "".join(traceback.format_exception(etype, value, tb)) + return err_str + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 +def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + type: Any, +) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] + """Make a ZMQ socket with the proper bind/connect semantics.""" + + mem = psutil.virtual_memory() + socket = ctx.socket(type) + + # Calculate buffer size based on system memory + total_mem = mem.total / 1024**3 + available_mem = mem.available / 1024**3 + # For systems with substantial memory (>32GB total, >16GB available): + # - Set a large 0.5GB buffer to improve throughput + # For systems with less memory: + # - Use system default (-1) to avoid excessive memory consumption + if total_mem > 32 and available_mem > 16: + buf_size = int(0.5 * 1024**3) # 0.5GB in bytes + else: + buf_size = -1 # Use system default buffer size + + if type == zmq.constants.PULL: + socket.setsockopt(zmq.constants.RCVHWM, 0) + socket.setsockopt(zmq.constants.RCVBUF, buf_size) + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.constants.SNDHWM, 0) + socket.setsockopt(zmq.constants.SNDBUF, buf_size) + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + return socket + + +@contextlib.contextmanager +def zmq_socket_ctx( + path: str, + type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] + """Context manager for a ZMQ socket""" + + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] + try: + yield make_zmq_socket(ctx, path, type) + + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + + +def _check_multiproc_method(): + if (cuda_is_initialized() + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): + logger.warning("CUDA was previously initialized. We must use " + "the `spawn` multiprocessing start method. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "See https://docs.vllm.ai/en/latest/getting_started/" + "troubleshooting.html#python-multiprocessing " + "for more information.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def get_mp_context(): + _check_multiproc_method() + mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD + return multiprocessing.get_context(mp_method) + + +def bind_kv_cache( + ctx: Dict[str, Any], + kv_cache: List[List[torch.Tensor]], # [virtual_engine][layer_index] +) -> None: + # Bind the kv_cache tensor to Attention modules, similar to + # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)] + # Special things handled here: + # 1. Some models have non-attention layers, e.g., Jamba + # 2. Pipeline parallelism, each rank only has a subset of layers + # 3. Encoder attention has no kv cache + # 4. 
Encoder-decoder models, encoder-decoder attention and decoder-only + # attention of the same layer (e.g., bart's decoder.layers.1.self_attn + # and decoder.layers.1.encoder_attn) is mapped to the same kv cache + # tensor + from vllm.attention import AttentionType + from vllm.model_executor.models.utils import extract_layer_index + layer_need_kv_cache = [ + layer_name for layer_name in ctx + if ctx[layer_name].attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_DECODER) + ] + layer_index_sorted = sorted( + set( + extract_layer_index(layer_name) + for layer_name in layer_need_kv_cache)) + for layer_name in layer_need_kv_cache: + kv_cache_idx = layer_index_sorted.index( + extract_layer_index(layer_name)) + forward_ctx = ctx[layer_name] + assert len(forward_ctx.kv_cache) == len(kv_cache) + for ve, ve_kv_cache in enumerate(kv_cache): + forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] + + +def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], + kwargs: Dict[str, Any]) -> Any: + """ + Run a method of an object with the given arguments and keyword arguments. + If the method is string, it will be converted to a method using getattr. + If the method is serialized bytes and will be deserialized using + cloudpickle. + If the method is a callable, it will be called directly. + """ + if isinstance(method, bytes): + func = partial(cloudpickle.loads(method), obj) + elif isinstance(method, str): + try: + func = getattr(obj, method) + except AttributeError: + raise NotImplementedError(f"Method {method!r} is not" + " implemented.") from None + else: + func = partial(method, obj) # type: ignore + return func(*args, **kwargs) + + +def import_pynvml(): + """ + Historical comments: + + libnvml.so is the library behind nvidia-smi, and + pynvml is a Python wrapper around it. We use it to get GPU + status without initializing CUDA context in the current process. + Historically, there are two packages that provide pynvml: + - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official + wrapper. It is a dependency of vLLM, and is installed when users + install vLLM. It provides a Python module named `pynvml`. + - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper. + Prior to version 12.0, it also provides a Python module `pynvml`, + and therefore conflicts with the official one. What's worse, + the module is a Python package, and has higher priority than + the official one which is a standalone Python file. + This causes errors when both of them are installed. + Starting from version 12.0, it migrates to a new module + named `pynvml_utils` to avoid the conflict. + + TL;DR: if users have pynvml<12.0 installed, it will cause problems. + Otherwise, `import pynvml` will import the correct module. + We take the safest approach here, to manually import the correct + `pynvml.py` module from the `nvidia-ml-py` package. + """ + if TYPE_CHECKING: + import pynvml + return pynvml + if "pynvml" in sys.modules: + import pynvml + if pynvml.__file__.endswith("__init__.py"): + # this is pynvml < 12.0 + raise RuntimeError( + "You are using a deprecated `pynvml` package. " + "Please uninstall `pynvml` or upgrade to at least" + " version 12.0. 
See https://pypi.org/project/pynvml " + "for more information.") + return sys.modules["pynvml"] + import importlib.util + import os + import site + for site_dir in site.getsitepackages(): + pynvml_path = os.path.join(site_dir, "pynvml.py") + if os.path.exists(pynvml_path): + spec = importlib.util.spec_from_file_location( + "pynvml", pynvml_path) + pynvml = importlib.util.module_from_spec(spec) + sys.modules["pynvml"] = pynvml + spec.loader.exec_module(pynvml) + return pynvml diff --git a/.venv/lib/python3.11/site-packages/vllm/version.py b/.venv/lib/python3.11/site-packages/vllm/version.py new file mode 100644 index 0000000000000000000000000000000000000000..70cd0289b441f8e8dc169e1e269d6fe90f44eb27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/version.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 + +try: + from ._version import __version__, __version_tuple__ +except Exception as e: + import warnings + + warnings.warn(f"Failed to read commit hash:\n{e}", + RuntimeWarning, + stacklevel=2) + + __version__ = "dev" + __version_tuple__ = (0, 0, __version__)