diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__init__.py b/.venv/lib/python3.11/site-packages/vllm/assets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de9e51bfc0d09c60c55f66b2c9f46362d4a884a8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/audio.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/audio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3eda9220e8c7a627e59b84e0628cffe7498b0529 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/audio.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..233de1023c0aa423d04a40ad3386f3562ef96a65 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/image.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/image.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7df8a9bf9066c0c8da290d699e83ddf3341809cb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/image.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/video.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/video.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88cb51934d834a372b3ce8fae5b12a6ef9dce8f6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/video.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/audio.py b/.venv/lib/python3.11/site-packages/vllm/assets/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..d9e51082e6ca2a7ea54deb5c30a30b35c561eadf --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/assets/audio.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import Literal +from urllib.parse import urljoin + +import numpy.typing as npt + +from vllm.utils import PlaceholderModule + +from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + +ASSET_DIR = "multimodal_asset" + + +@dataclass(frozen=True) +class AudioAsset: + name: Literal["winning_call", "mary_had_lamb"] + + @property + def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: + audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", + s3_prefix=ASSET_DIR) + return librosa.load(audio_path, sr=None) + + @property + def url(self) -> str: + return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/base.py b/.venv/lib/python3.11/site-packages/vllm/assets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..03f3b9dabf1438662d533be53d3abacc679c472b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/assets/base.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 + +from functools import lru_cache +from pathlib import Path +from typing import Optional + +import vllm.envs as envs +from vllm.connections import 
global_http_connection + +VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" + + +def get_cache_dir() -> Path: +    """Get the path to the cache for storing downloaded assets.""" +    path = Path(envs.VLLM_ASSETS_CACHE) +    path.mkdir(parents=True, exist_ok=True) + +    return path + + +@lru_cache +def get_vllm_public_assets(filename: str, +                           s3_prefix: Optional[str] = None) -> Path: +    """ +    Download an asset file from ``s3://vllm-public-assets`` +    and return the path to the downloaded file. +    """ +    asset_directory = get_cache_dir() / "vllm_public_assets" +    asset_directory.mkdir(parents=True, exist_ok=True) + +    asset_path = asset_directory / filename +    if not asset_path.exists(): +        if s3_prefix is not None: +            filename = s3_prefix + "/" + filename +        global_http_connection.download_file( +            f"{VLLM_S3_BUCKET_URL}/{filename}", +            asset_path, +            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT) + +    return asset_path diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/image.py b/.venv/lib/python3.11/site-packages/vllm/assets/image.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1d258da9c784ab2eeffbcb380afbf75c338183 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/assets/image.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import Literal + +import torch +from PIL import Image + +from .base import get_vllm_public_assets + +VLM_IMAGES_DIR = "vision_model_images" + + +@dataclass(frozen=True) +class ImageAsset: + name: Literal["stop_sign", "cherry_blossom"] + + @property + def pil_image(self) -> Image.Image: + image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", + s3_prefix=VLM_IMAGES_DIR) + return Image.open(image_path) + + @property + def image_embeds(self) -> torch.Tensor: + """ + Image embeddings, only used for testing purposes with llava 1.5. 
+ """ + image_path = get_vllm_public_assets(filename=f"{self.name}.pt", + s3_prefix=VLM_IMAGES_DIR) + return torch.load(image_path, map_location="cpu", weights_only=True) diff --git a/.venv/lib/python3.11/site-packages/vllm/assets/video.py b/.venv/lib/python3.11/site-packages/vllm/assets/video.py new file mode 100644 index 0000000000000000000000000000000000000000..494cfc38381cfd70c93787a9eb013eec89425768 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/assets/video.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from functools import lru_cache +from typing import List, Literal + +import cv2 +import numpy as np +import numpy.typing as npt +from huggingface_hub import hf_hub_download +from PIL import Image + +from vllm.multimodal.video import sample_frames_from_video + +from .base import get_cache_dir + + +@lru_cache +def download_video_asset(filename: str) -> str: + """ + Download and open an image from huggingface + repo: raushan-testing-hf/videos-test + """ + video_directory = get_cache_dir() / "video-example-data" + video_directory.mkdir(parents=True, exist_ok=True) + + video_path = video_directory / filename + video_path_str = str(video_path) + if not video_path.exists(): + video_path_str = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", + filename=filename, + repo_type="dataset", + cache_dir=video_directory, + ) + return video_path_str + + +def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: + cap = cv2.VideoCapture(path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file {path}") + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frames = [] + for i in range(total_frames): + ret, frame = cap.read() + if ret: + frames.append(frame) + cap.release() + + frames = np.stack(frames) + frames = sample_frames_from_video(frames, num_frames) + if len(frames) < num_frames: + raise ValueError(f"Could not read enough frames from video file {path}" + f" 
(expected {num_frames} frames, got {len(frames)})") + return frames + + +def video_to_pil_images_list(path: str, + num_frames: int = -1) -> List[Image.Image]: + frames = video_to_ndarrays(path, num_frames) + return [ + Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + for frame in frames + ] + + +@dataclass(frozen=True) +class VideoAsset: + name: Literal["sample_demo_1.mp4"] + num_frames: int = -1 + + @property + def pil_images(self) -> List[Image.Image]: + video_path = download_video_asset(self.name) + ret = video_to_pil_images_list(video_path, self.num_frames) + return ret + + @property + def np_ndarrays(self) -> npt.NDArray: + video_path = download_video_asset(self.name) + ret = video_to_ndarrays(video_path, self.num_frames) + return ret diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/__init__.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..741bd1a6a1c1264b9bb52e1f8ced82fe8578e1b1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/__init__.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 + +from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .hasher import MultiModalHashDict, MultiModalHasher +from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, + MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict, NestedTensors) +from .registry import MultiModalRegistry + +MULTIMODAL_REGISTRY = MultiModalRegistry() +""" +The global :class:`~MultiModalRegistry` is used by model runners to +dispatch data processing according to the target model. 
+ +See also: + :ref:`mm-processing` +""" + +__all__ = [ + "BatchedTensorInputs", + "ModalityData", + "MultiModalDataBuiltins", + "MultiModalDataDict", + "MultiModalHashDict", + "MultiModalHasher", + "MultiModalKwargs", + "MultiModalPlaceholderDict", + "MultiModalPlaceholderMap", + "MultiModalPlugin", + "NestedTensors", + "MULTIMODAL_REGISTRY", + "MultiModalRegistry", +] diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/hasher.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/hasher.py new file mode 100644 index 0000000000000000000000000000000000000000..7d277fd67deca1425603c857586a5bdd66867248 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/hasher.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pickle +from typing import TYPE_CHECKING, Iterable, Mapping, Optional + +import numpy as np +import torch +from blake3 import blake3 +from PIL import Image + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.inputs import TokensPrompt + +logger = init_logger(__name__) + +MultiModalHashDict = Mapping[str, list[str]] +""" +A dictionary containing hashes for items in each modality. +""" + + +class MultiModalHasher: + + @classmethod + def serialize_item(cls, obj: object) -> bytes: + # Simple cases + if isinstance(obj, str): + return obj.encode("utf-8") + if isinstance(obj, bytes): + return obj + if isinstance(obj, Image.Image): + return obj.tobytes() + + # Convertible to NumPy arrays + if isinstance(obj, torch.Tensor): + obj = obj.numpy() + if isinstance(obj, (int, float)): + obj = np.array(obj) + if isinstance(obj, np.ndarray): + return obj.tobytes() + + logger.warning( + "No serialization method found for %s. 
" + "Falling back to pickle.", type(obj)) + + return pickle.dumps(obj) + + @classmethod + def item_to_bytes( + cls, + key: str, + obj: object, + ) -> Iterable[tuple[bytes, bytes]]: + # Recursive cases + if isinstance(obj, (list, tuple)): + for i, elem in enumerate(obj): + yield from cls.item_to_bytes(f"{key}.{i}", elem) + elif isinstance(obj, dict): + for k, v in obj.items(): + yield from cls.item_to_bytes(f"{key}.{k}", v) + else: + key_bytes = cls.serialize_item(key) + value_bytes = cls.serialize_item(obj) + yield key_bytes, value_bytes + + @classmethod + def hash_kwargs(cls, **kwargs: object) -> str: + hasher = blake3() + + for k, v in kwargs.items(): + for k_bytes, v_bytes in cls.item_to_bytes(k, v): + hasher.update(k_bytes) + hasher.update(v_bytes) + + return hasher.hexdigest() + + @classmethod + def hash_prompt_mm_data( + cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]: + """Hash multimodal data in the user input prompt if they exist.""" + + if "multi_modal_data" not in prompt: + return None + + mm_data = prompt["multi_modal_data"] + if not mm_data: + # mm_data can be None or an empty dict. 
+ return None + + mm_items = { + modality: items if isinstance(items, list) else [items] + for modality, items in mm_data.items() + } + + mm_hashes = { + modality: [cls.hash_kwargs(**{modality: item}) for item in items] + for modality, items in mm_items.items() + } + + return mm_hashes diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/image.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/image.py new file mode 100644 index 0000000000000000000000000000000000000000..98ac8057e8f18342f28318c58d8b069f41ce4323 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/image.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 + +import base64 +from functools import lru_cache +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional + +import torch +from PIL import Image + +from vllm.inputs.registry import InputContext +from vllm.logger import init_logger +from vllm.transformers_utils.processor import get_image_processor +from vllm.utils import is_list_of + +from .base import MediaIO, MultiModalPlugin +from .inputs import ImageItem, ModalityData, MultiModalKwargs + +if TYPE_CHECKING: + from vllm.config import ModelConfig + +logger = init_logger(__name__) + +cached_get_image_processor = lru_cache(get_image_processor) + + +class ImagePlugin(MultiModalPlugin): + """Plugin for image data.""" + + def get_data_key(self) -> str: + return "image" + + def _get_hf_image_processor( + self, + model_config: "ModelConfig", + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + ): + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + return cached_get_image_processor( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs) + + def _default_input_mapper( + self, + ctx: InputContext, + data: ModalityData[ImageItem], + **mm_processor_kwargs, + ) -> MultiModalKwargs: + model_config = ctx.model_config + + # PIL image + if isinstance(data, Image.Image) 
or is_list_of(data, Image.Image): + image_processor = self._get_hf_image_processor( + model_config, + mm_processor_kwargs, + ) + + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + try: + # NOTE: It may make sense to forward the mm_processor_kwargs + # here too. For now, to keep it simple, we only allow it be + # used for the initialization call though, just in case the + # signatures of the preprocessor initializer don't match + # preprocess() + batch_data = image_processor \ + .preprocess(data, return_tensors="pt") \ + .data + except Exception: + logger.error( + "Failed to process image (%s) with the default mapper. " + "This is most likely an edge-case with this model's image " + "processor in transformers (type: %s), and not vLLM.", + data, + type(image_processor).__name__) + raise + + return MultiModalKwargs(batch_data) + + # Image embedding + elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): + return MultiModalKwargs({"image_embeds": data}) + + raise TypeError(f"Invalid image type: {type(data)}") + + def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: + return 3000 + + +def rescale_image_size(image: Image.Image, + size_factor: float, + transpose: int = -1) -> Image.Image: + """Rescale the dimensions of an image by a constant factor.""" + new_width = int(image.width * size_factor) + new_height = int(image.height * size_factor) + image = image.resize((new_width, new_height)) + if transpose >= 0: + image = image.transpose(Image.Transpose(transpose)) + return image + + +class ImageMediaIO(MediaIO[Image.Image]): + + def __init__(self, *, image_mode: str = "RGB") -> None: + super().__init__() + + self.image_mode = image_mode + + def load_bytes(self, data: bytes) -> Image.Image: + image = Image.open(BytesIO(data)) + image.load() + return image.convert(self.image_mode) + + def load_base64(self, media_type: str, data: str) -> Image.Image: + return 
self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> Image.Image: + image = Image.open(filepath) + image.load() + return image.convert(self.image_mode) + + def encode_base64( + self, + media: Image.Image, + *, + image_format: str = "JPEG", + ) -> str: + image = media + + with BytesIO() as buffer: + image = image.convert(self.image_mode) + image.save(buffer, image_format) + data = buffer.getvalue() + + return base64.b64encode(data).decode('utf-8') diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/inputs.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9593ee8b2052d23000ff6319f0b9813d9e4f6f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/inputs.py @@ -0,0 +1,741 @@ +# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +from collections import UserDict, defaultdict +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from functools import partial +from itertools import accumulate +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, + Union, cast, final) + +import numpy as np +import torch +import torch.types +from PIL.Image import Image +from transformers import BatchFeature +from typing_extensions import NotRequired, TypeAlias + +from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves + +if TYPE_CHECKING: + from .hasher import MultiModalHashDict + +_T = TypeVar("_T") + +HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace :code:`ImageProcessor`. 
+""" + +HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, + list[np.ndarray], list[torch.Tensor]] +""" +A :class:`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace :code:`VideoProcessor`. +""" + +HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] +""" +Represents a single audio +item, which can be passed to a HuggingFace :code:`AudioProcessor`. +""" + +ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace :code:`ImageProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as image embeddings; +these are directly passed to the model without HF processing. +""" + +VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] +""" +A :class:`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace :code:`VideoProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as video embeddings; +these are directly passed to the model without HF processing. +""" + +AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], + torch.Tensor] +""" +Represents a single audio +item, which can be passed to a HuggingFace :code:`AudioProcessor`. + +Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate +is different from that expected by the model; +these are resampled to the model's sampling rate before being processed by HF. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as audio embeddings; +these are directly passed to the model without HF processing. +""" + +ModalityData: TypeAlias = Union[_T, list[_T]] +""" +Either a single data item, or a list of data items. + +The number of data items allowed per modality is restricted by +:code:`--limit-mm-per-prompt`. 
+""" + + +@final +class MultiModalDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: ModalityData[ImageItem] + """The input image(s).""" + + video: ModalityData[VideoItem] + """The input video(s).""" + + audio: ModalityData[AudioItem] + """The input audio(s).""" + + +MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +The built-in modalities are defined by :class:`MultiModalDataBuiltins`. +""" + + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + Example: + + Prompt: :code:`AAAA BBBB What is in these images?` + + Images A and B will have: + + .. code-block:: + + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, + tuple[torch.Tensor, ...]] +""" +Uses a list instead of a tensor if the dimensions of each element do not match. +""" + + +def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: + """Equality check between :data:`NestedTensors` objects.""" + if isinstance(a, torch.Tensor): + return isinstance(b, torch.Tensor) and torch.equal(a, b) + elif isinstance(b, torch.Tensor): + return isinstance(a, torch.Tensor) and torch.equal(b, a) + + if isinstance(a, list): + return (isinstance(b, list) + and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b))) + if isinstance(b, list): + return (isinstance(a, list) + and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a))) + + # Both a and b are scalars + return a == b + + +BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] +""" +A dictionary containing nested tensors which have been batched via +:meth:`MultiModalKwargs.batch`. 
+""" + + +@dataclass(frozen=True) +class MultiModalFieldElem: + """ + Represents a keyword argument corresponding to a multi-modal item + in :class:`MultiModalKwargs`. + """ + + modality: str + """ + The modality of the corresponding multi-modal item. + Each multi-modal item can consist of multiple keyword arguments. + """ + + key: str + """ + The key of this field in :class:`MultiModalKwargs`, + i.e. the name of the keyword argument to be passed to the model. + """ + + data: NestedTensors + """ + The tensor data of this field in :class:`MultiModalKwargs`, + i.e. the value of the keyword argument to be passed to the model. + """ + + field: "BaseMultiModalField" + """ + Defines how to combine the tensor data of this field with others + in order to batch multi-modal items together for model inference. + """ + + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + + return ((self.modality, self.key) == (other.modality, other.key) + and nested_tensors_equal(self.data, other.data) + and type(self.field) == type(other.field)) # noqa: E721 + + +@dataclass(frozen=True) +class BaseMultiModalField(ABC): + """ + Defines how to interpret tensor data belonging to a keyword argument in + :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. + """ + + def _field_factory(self, *, modality: str, key: str): + f = partial( + MultiModalFieldElem, + modality=modality, + key=key, + field=self, + ) + + # Allow passing data as positional argument + def factory(data: NestedTensors) -> MultiModalFieldElem: + return f(data=data) + + return factory + + @abstractmethod + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + """ + Construct :class:`MultiModalFieldElem` instances to represent + the provided data. + + This is the inverse of :meth:`reduce_data`. 
+ """ + raise NotImplementedError + + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError + + def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: + """ + Merge the data from multiple instances of :class:`MultiModalFieldElem`. + + This is the inverse of :meth:`build_elems`. + """ + field_types = [type(item.field) for item in elems] + if len(set(field_types)) > 1: + raise ValueError(f"Cannot merge different {field_types=}") + + return self._reduce_data([item.data for item in elems]) + + +@dataclass(frozen=True) +class MultiModalBatchedField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.batched` + """ + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(item) for item in data] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as `torch.stack(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].unsqueeze(0).contiguous() + first_shape = batch[0].shape + if all(elem.shape == first_shape for elem in batch): + return torch.stack(batch) + + return batch + + +@dataclass(frozen=True) +class MultiModalFlatField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.flat` + :func:`MultiModalFieldConfig.flat_from_sizes` + """ + slices: Sequence[slice] + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data[s]) for s in self.slices] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if 
len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as `torch.concat(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].contiguous() + first_shape = batch[0].shape + if all(elem.shape[1:] == first_shape[1:] for elem in batch): + return torch.concat(batch) + + return [e for elem in batch for e in elem] + + +@dataclass(frozen=True) +class MultiModalSharedField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.shared` + """ + batch_size: int + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data)] * self.batch_size + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + return batch[0] + + +class MultiModalFieldConfig: + + @staticmethod + def batched(modality: str): + """ + Defines a field where an element in the batch is obtained by + indexing into the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + + Example: + + .. code-block:: + + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] + + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] + """ + return MultiModalFieldConfig( + field=MultiModalBatchedField(), + modality=modality, + ) + + @staticmethod + def flat(modality: str, slices: Sequence[slice]): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, a slice that is used to extract + the data corresponding to it. + + Example: + + .. 
code-block:: + + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + """ + return MultiModalFieldConfig( + field=MultiModalFlatField(slices=slices), + modality=modality, + ) + + @staticmethod + def flat_from_sizes(modality: str, size_per_item: torch.Tensor): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, the size of the slice that + is used to extract the data corresponding to it. + + Example: + + .. code-block:: + + Given: + size_per_item: [3, 4, 2] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + + See also: + :func:`MultiModalFieldConfig.flat` + """ + + slice_idxs = [0, *accumulate(size_per_item)] + slices = [ + slice(slice_idxs[i], slice_idxs[i + 1]) + for i in range(len(size_per_item)) + ] + + return MultiModalFieldConfig.flat(modality, slices) + + @staticmethod + def shared(modality: str, batch_size: int): + """ + Defines a field where an element in the batch is obtained by + taking the entirety of the underlying data. + + This means that the data is the same for each element in the batch. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + batch_size: The number of multi-modal items which share this data. + + Example: + + .. 
code-block:: + + Given: + batch_size: 4 + + Input: + Data: [XYZ] + + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] + """ + return MultiModalFieldConfig( + field=MultiModalSharedField(batch_size), + modality=modality, + ) + + def __init__(self, field: BaseMultiModalField, modality: str) -> None: + super().__init__() + + self.field = field + self.modality = modality + + def build_elems( + self, + key: str, + batch: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + return self.field.build_elems(self.modality, key, batch) + + +class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): + """ + A collection of :class:`MultiModalFieldElem` + corresponding to a data item in :class:`MultiModalDataItems`. + """ + + @staticmethod + def from_elems(elems: Sequence[MultiModalFieldElem]): + return MultiModalKwargsItem({elem.key: elem for elem in elems}) + + @property + def modality(self) -> str: + modalities = {elem.modality for elem in self.data.values()} + assert len(modalities) == 1, f"Found different modalities={modalities}" + return next(iter(modalities)) + + +# NOTE: UserDict is for V0 compatibility. +# V1 should access individual items via `get_item`. +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. + + The metadata :code:`items` enables us to obtain the keyword arguments + corresponding to each data item in :class:`MultiModalDataItems`, via + :meth:`get_item` and :meth:`get_items`. 
+ """ + + @staticmethod + def from_hf_inputs( + hf_inputs: BatchFeature, + config_by_key: Mapping[str, MultiModalFieldConfig], + ): + # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` + # We assume that those fields are not used in vLLM + elems_by_key = dict[str, Sequence[MultiModalFieldElem]]() + keys_by_modality = defaultdict[str, set[str]](set) + for key, config in config_by_key.items(): + batch = hf_inputs.get(key) + if batch is not None: + elems = config.build_elems(key, batch) + if len(elems) > 0: + elems_by_key[key] = elems + keys_by_modality[config.modality].add(key) + + items = list[MultiModalKwargsItem]() + for modality, keys in keys_by_modality.items(): + elems_in_modality = {k: elems_by_key[k] for k in keys} + batch_sizes = {k: len(v) for k, v in elems_in_modality.items()} + + if len(set(batch_sizes.values())) > 1: + raise ValueError( + f"Cannot merge different batch sizes for {modality=}! " + f"Found: {batch_sizes=}") + + batch_size = next(iter(batch_sizes.values())) + for item_idx in range(batch_size): + elems = [v[item_idx] for v in elems_in_modality.values()] + items.append(MultiModalKwargsItem.from_elems(elems)) + + return MultiModalKwargs.from_items(items) + + @staticmethod + def from_items(items: Sequence[MultiModalKwargsItem]): + """Construct a new :class:`MultiModalKwargs` from multiple items.""" + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + + data = { + key: elems[0].field.reduce_data(elems) + for key, elems in elems_by_key.items() if len(elems) > 0 + } + + return MultiModalKwargs(data, items=items) + + def __init__( + self, + data: Mapping[str, NestedTensors], + *, + items: Optional[Sequence[MultiModalKwargsItem]] = None, + ) -> None: + super().__init__(data) + + items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + self._items_by_modality = dict(items_by_modality) + + @property + def 
modalities(self): + return self._items_by_modality.keys() + + @staticmethod + def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + """ + Stack the inner dimensions that have the same shape in + a nested list of tensors. + + Thus, a dimension represented by a list means that the inner + dimensions are different for each element along that dimension. + """ + if isinstance(nested_tensors, torch.Tensor): + return nested_tensors + + # TODO: Remove these once all models have been migrated + if isinstance(nested_tensors, np.ndarray): + return torch.from_numpy(nested_tensors) + if isinstance(nested_tensors, (int, float)): + return torch.tensor(nested_tensors) + + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + if not is_list_of(stacked, torch.Tensor, check="all"): + # Only tensors (not lists) can be stacked. + return stacked + + tensors_ = cast(list[torch.Tensor], stacked) + if len(tensors_) == 1: + # An optimization when `tensors_` contains only one tensor: + # - produce exactly same result as `torch.stack(tensors_)` + # - will achieve zero-copy if the tensor is contiguous + return tensors_[0].unsqueeze(0).contiguous() + + if any(t.shape != tensors_[0].shape for t in tensors_): + # The tensors have incompatible shapes and can't be stacked. + return tensors_ + + return torch.stack(tensors_) + + @staticmethod + def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs: + """ + Batch multiple inputs together into a dictionary. + + The resulting dictionary has the same keys as the inputs. + If the corresponding value from each input is a tensor and they all + share the same shape, the output value is a single batched tensor; + otherwise, the output value is a list containing the original value + from each input. + """ + if len(inputs_list) == 0: + return {} + + # We need to consider the case where each item in the batch + # contains different modalities (i.e. different keys). 
+ item_lists = defaultdict[str, list[NestedTensors]](list) + + for inputs in inputs_list: + for k, v in inputs.items(): + item_lists[k].append(v) + + return { + k: MultiModalKwargs._try_stack(item_list) + for k, item_list in item_lists.items() + } + + @staticmethod + def as_kwargs( + batched_inputs: BatchedTensorInputs, + *, + device: torch.types.Device, + ) -> BatchedTensorInputs: + json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + + json_mapped = json_map_leaves( + lambda x: x.to(device, non_blocking=True), + json_inputs, + ) + + return cast(BatchedTensorInputs, json_mapped) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + if self._items_by_modality != other._items_by_modality: + return False + + ks = self.keys() + return (ks == other.keys() + and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + + def _validate_modality(self, method_name: str, modality: str) -> None: + if not self._items_by_modality: + raise RuntimeError( + f"`{method_name}` is not supported when " + "MultiModalKwargs is not initialized with `items`") + + if modality not in self._items_by_modality: + available_modalities = set(self._items_by_modality.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + def get_item_count(self, modality: str) -> int: + """Get the number of items belonging to a modality.""" + self._validate_modality("get_item_count", modality) + return len(self._items_by_modality[modality]) + + def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem: + """ + Get the keyword arguments corresponding to an item identified by + its modality and index. + """ + self._validate_modality("get_item", modality) + return self._items_by_modality[modality][item_index] + + def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: + """ + Get the keyword arguments corresponding to each item belonging to + a modality. 
+ """ + self._validate_modality("get_items", modality) + return self._items_by_modality[modality] + + +MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +""" +A dictionary containing placeholder ranges for each modality. +""" + + +class MultiModalInputs(TypedDict): + """ + Represents the outputs of + :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, + ready to be passed to vLLM internals. + """ + + type: Literal["multimodal"] + """The type of inputs.""" + + prompt: str + """The processed prompt text.""" + + prompt_token_ids: list[int] + """The processed token IDs which includes placeholder tokens.""" + + token_type_ids: NotRequired[list[int]] + """The token type IDs of the prompt.""" + + mm_kwargs: MultiModalKwargs + """Keyword arguments to be directly passed to the model after batching.""" + + mm_hashes: NotRequired[Optional["MultiModalHashDict"]] + """The hashes of the multi-modal data.""" + + mm_placeholders: MultiModalPlaceholderDict + """ + For each modality, information about the placeholder tokens in + :code:`prompt_token_ids`. 
+ """ diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/parse.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/parse.py new file mode 100644 index 0000000000000000000000000000000000000000..063f458b2c4d958b578e3632c18a9057cb52838f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/parse.py @@ -0,0 +1,368 @@ +# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +from collections import UserDict +from collections.abc import Callable, Iterator, Mapping, Sequence +from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar, + Union) + +import numpy as np +import torch +from PIL.Image import Image +from typing_extensions import TypeAlias, TypeGuard, assert_never + +from vllm.utils import is_list_of + +from .audio import resample_audio +from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, + ImageItem, ModalityData, MultiModalDataDict, VideoItem) + +_T = TypeVar("_T") +_I = TypeVar("_I") + + +class ModalityDataItems(ABC, Generic[_T, _I]): + """ + Represents data items for a modality in :class:`MultiModalDataItems`. + """ + + def __init__(self, data: _T, modality: str) -> None: + super().__init__() + + self.data = data + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r}, " + f"len={len(self)})") + + def __len__(self) -> int: + return self.get_count() + + def __getitem__(self, index: int) -> _I: + return self.get(index) + + if TYPE_CHECKING: + # Auto-generated + def __iter__(self) -> Iterator[_I]: + ... 
    @abstractmethod
    def get_count(self) -> int:
        """Get the number of data items."""
        raise NotImplementedError

    @abstractmethod
    def get(self, index: int) -> _I:
        """Get a data item by its index."""
        raise NotImplementedError

    def get_all(self) -> list[_I]:
        """Get all data items."""
        return [self.get(idx) for idx in range(self.get_count())]

    @abstractmethod
    def get_processor_data(self) -> Mapping[str, object]:
        """Get the data to pass to the HF processor."""
        raise NotImplementedError

    @abstractmethod
    def get_passthrough_data(self) -> Mapping[str, object]:
        """Get the data to pass directly to the model."""
        raise NotImplementedError


class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
    """Base class for data items that are arranged in a list."""

    def get_count(self) -> int:
        return len(self.data)

    def get(self, index: int) -> _T:
        return self.data[index]

    def get_processor_data(self) -> Mapping[str, object]:
        # Pluralizes the modality name (e.g. "image" -> "images"), which is
        # the kwarg name the HF processor expects for this data.
        return {f"{self.modality}s": self.data}

    def get_passthrough_data(self) -> Mapping[str, object]:
        return {}


class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
                                       torch.Tensor]):
    """
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    """

    def get_count(self) -> int:
        return len(self.data)

    def get(self, index: int) -> torch.Tensor:
        return self.data[index]

    def get_processor_data(self) -> Mapping[str, object]:
        # Embeddings bypass the HF processor entirely.
        return {}

    def get_passthrough_data(self) -> Mapping[str, object]:
        return {f"{self.modality}_embeds": self.data}

    def get_feature_size(self, item_idx: int) -> int:
        # Number of embedding vectors for the item, i.e. its first dimension.
        return len(self.get(item_idx))


class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):

    def __init__(self, data: Sequence[HfAudioItem]) -> None:
        super().__init__(data, "audio")


class AudioEmbeddingItems(EmbeddingItems):

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "audio")


class ImageSize(NamedTuple):
    width: int
    height: int


class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):

    def __init__(self, data: Sequence[HfImageItem]) -> None:
        super().__init__(data, "image")

    def get_image_size(self, item_idx: int) -> ImageSize:
        image = self.get(item_idx)

        if isinstance(image, Image):
            # PIL's `Image.size` is already (width, height).
            return ImageSize(*image.size)
        if isinstance(image, (np.ndarray, torch.Tensor)):
            # Arrays/tensors are assumed channel-first (C, H, W) per the
            # unpacking below — TODO confirm against callers.
            _, h, w = image.shape
            return ImageSize(w, h)

        assert_never(image)


class ImageEmbeddingItems(EmbeddingItems):

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "image")


class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):

    def __init__(self, data: Sequence[HfVideoItem]) -> None:
        super().__init__(data, "video")

    def get_num_frames(self, item_idx: int) -> int:
        return len(self.get(item_idx))

    def get_frame_size(self, item_idx: int) -> ImageSize:
        image = self.get(item_idx)[0]  # Assume that the video isn't empty

        if isinstance(image, Image):
            return ImageSize(*image.size)
        if isinstance(image, (np.ndarray, torch.Tensor)):
            _, h, w = image.shape
            return ImageSize(w, h)

        assert_never(image)


class VideoEmbeddingItems(EmbeddingItems):

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "video")


_D = TypeVar("_D", bound=ModalityDataItems[Any, Any])


class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
    """
    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
    such that each entry corresponds to a list.
    """

    def get_count(self, modality: str, *, strict: bool = True) -> int:
        """
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising :exc:`KeyError`
        even if the modality is not found.
        """
        if modality not in self:
            if strict:
                available_modalities = set(self.keys())
                raise KeyError(f"Modality {modality!r} not found. "
                               f"Available modalities: {available_modalities}")

            return 0

        return self[modality].get_count()

    def get_all_counts(self) -> Mapping[str, int]:
        """Get the number of items belonging to each modality."""
        return {m: items.get_count() for m, items in self.items()}

    def get_items(
        self,
        modality: str,
        typ: Union[type[_D], tuple[type[_D], ...]],
    ) -> _D:
        """
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.
        """
        if modality not in self:
            available_modalities = set(self.keys())
            raise KeyError(f"Modality {modality!r} not found. "
                           f"Available modalities: {available_modalities}")

        items = self[modality]
        if not isinstance(items, typ):
            raise TypeError(f"Invalid type of data items for {modality=}. "
                            f"Expected type: {typ}, but "
                            f"found type: {type(items)}")

        return items  # type: ignore[return-value]


ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
                                         ModalityDataItems[Any, Any]]


class MultiModalDataParser:
    """
    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
    :class:`MultiModalDataItems`.
+ + Args: + target_sr (float, optional): Enables automatic resampling of audio + items to the model's expected sampling rate. + """ + + def __init__(self, *, target_sr: Optional[float] = None) -> None: + super().__init__() + + self.target_sr = target_sr + + def _is_embeddings( + self, data: object + ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]: + if isinstance(data, torch.Tensor): + return data.ndim == 3 + if is_list_of(data, torch.Tensor): + return len(data) == 0 or data[0].ndim == 2 + + return False + + def _get_audio_with_sr( + self, + audio: AudioItem, + ) -> tuple[np.ndarray, Optional[float]]: + if isinstance(audio, tuple): + return audio + if isinstance(audio, list): + return np.array(audio), None + if isinstance(audio, np.ndarray): + return audio, None + if isinstance(audio, torch.Tensor): + return audio.numpy(), None + + assert_never(audio) + + def _parse_audio_data( + self, + data: ModalityData[AudioItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return AudioEmbeddingItems(data) + + if (is_list_of(data, float) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 1 + or isinstance(data, tuple)): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + new_audios = list[np.ndarray]() + for data_item in data_items: + audio, orig_sr = self._get_audio_with_sr(data_item) + if orig_sr is None: + new_audio = audio + else: + target_sr = self.target_sr + if target_sr is None: + raise RuntimeError( + "Audio resampling is not supported when " + "`target_sr` is not provided") + + new_audio = resample_audio(audio, + orig_sr=orig_sr, + target_sr=target_sr) + + new_audios.append(new_audio) + + return AudioProcessorItems(new_audios) + + def _parse_image_data( + self, + data: ModalityData[ImageItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return ImageEmbeddingItems(data) + + if (isinstance(data, 
Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 3): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return ImageProcessorItems(data_items) + + def _parse_video_data( + self, + data: ModalityData[VideoItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return VideoEmbeddingItems(data) + + if (is_list_of(data, Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 4): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return VideoProcessorItems(data_items) + + def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: + return { + "audio": self._parse_audio_data, + "image": self._parse_image_data, + "video": self._parse_video_data, + } + + def parse_mm_data(self, + mm_data: MultiModalDataDict) -> MultiModalDataItems: + subparsers = self._get_subparsers() + + mm_items = MultiModalDataItems() + for k, v in mm_data.items(): + if k not in subparsers: + raise ValueError(f"Unsupported modality: {k}") + + mm_items[k] = subparsers[k](v) + + return mm_items diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/processing.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/processing.py new file mode 100644 index 0000000000000000000000000000000000000000..d704fa59b96af1514d478b12de8e6adfa7c333b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/processing.py @@ -0,0 +1,1295 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from abc import ABC, abstractmethod +from collections import defaultdict +from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, + Sequence) +from dataclasses import dataclass, field +from functools import lru_cache +from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, + TypeVar, Union) + +from 
transformers import BatchFeature, PretrainedConfig, ProcessorMixin + +import vllm.envs as envs +from vllm.inputs import InputProcessingContext +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, + encode_tokens) +from vllm.utils import LRUCache, flatten_2d_lists, full_groupby + +from .hasher import MultiModalHasher +from .inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) +from .parse import MultiModalDataItems, MultiModalDataParser + +if TYPE_CHECKING: + from .profiling import BaseDummyInputsBuilder + +logger = init_logger(__name__) + +_S = TypeVar("_S", str, list[int]) + +PromptSeq = Union[str, list[int]] +"""A token sequence (list of token IDs) or text.""" + + +@dataclass +class PromptReplacementDetails: + """Details about the replacement token sequence or text.""" + + full: PromptSeq + """The full replacement.""" + + features: PromptSeq + """ + The part of the replacement that corresponds to feature placeholders; + this will be replaced by the output of the vision encoder during model + inference. + """ + + @staticmethod + def from_seq(seq: PromptSeq) -> "PromptReplacementDetails": + return PromptReplacementDetails(full=seq, features=seq) + + +PromptRepl = Union[PromptSeq, PromptReplacementDetails] +""" +The replacement token sequence or text. + +If only part of the replacement corresponds to feature placeholders, you can +use :class:`PromptReplacementDetails` to specify which part. +""" + + +@dataclass +class PromptReplacement: + """ + Defines how to replace portions of an input prompt with placeholder tokens. + + Example: + + For each image, replace one ```` input placeholder in the prompt + with a number of ```` feature placeholders + equal to the feature size of the vision encoder: + + .. 
code-block:: python + + PromptReplacement( + modality="image", + target="", + replacement="" * image_feature_size, + ) + + As above, but further pad the feature placeholders with ```` + and ```, which are not supposed to be passed to the vision + encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="", + replacement=PromptReplacementDetails( + full="".join([ + "", + "" * image_feature_size, + "", + ]), + features="" * image_feature_size, + ), + ) + + To avoid unnecessary tokenization during prompt replacement, + we recommended passing token sequences instead of text: + + .. code-block:: python + + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_bos_id] + [image_token_id] * image_feature_size + + [image_eos_id]), + features=[image_token_id] * image_feature_size, + ), + ) + """ + + modality: str + """The modality for which the replacement is made.""" + + target: PromptSeq + """The token sequence (or text) to find and replace.""" + + replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) + """ + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). + + For convenience, you can directly pass in the replacement token sequence + (or text) instead of a function if it does not depend on the input. 
+ """ + + def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": + return BoundPromptReplacement( + tokenizer=tokenizer, + modality=self.modality, + _target=self.target, + _replacement=self.replacement, + ) + + +@lru_cache(maxsize=2048) +def _cached_encode( + tokenizer: AnyTokenizer, + text: str, + *, + add_special_tokens: bool = False, +) -> list[int]: + return encode_tokens(tokenizer, + text, + add_special_tokens=add_special_tokens) + + +@lru_cache(maxsize=2048) +def _cached_decode( + tokenizer: AnyTokenizer, + token_ids: tuple[int, ...], + *, + skip_special_tokens: bool = False, +) -> str: + return decode_tokens(tokenizer, + list(token_ids), + skip_special_tokens=skip_special_tokens) + + +class _HasModalityAttr(Protocol): + modality: str + + +class _HasModalityProp(Protocol): + + @property + def modality(self) -> str: + ... + + +_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) + + +def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: + """Convenience function to apply :func:`full_groupby` based on modality.""" + return full_groupby(values, key=lambda x: x.modality) + + +@dataclass +class _BoundPromptSequence: + """ + A :data:`_PromptSeq` bound to a tokenizer to automatically + convert between token sequence and text representations. 
+ """ + tokenizer: AnyTokenizer = field(repr=False) + + _text: Optional[str] + _token_ids: Optional[list[int]] + + @staticmethod + def from_seq( + tokenizer: AnyTokenizer, + seq: PromptSeq, + ) -> "_BoundPromptSequence": + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + + def __post_init__(self) -> None: + if self._text is None and self._token_ids is None: + raise ValueError("At least one of 'text' and 'token_ids' must be " + "specified") + + @property + def text(self) -> str: + if self._text is None: + assert self._token_ids is not None + self._text = _cached_decode(self.tokenizer, tuple(self._token_ids)) + + return self._text + + @property + def token_ids(self) -> list[int]: + if self._token_ids is None: + assert self._text is not None + self._token_ids = _cached_encode(self.tokenizer, self._text) + + return self._token_ids + + +@dataclass +class _BoundPromptReplacementGroup: + full: _BoundPromptSequence + features: _BoundPromptSequence + + +@dataclass +class BoundPromptReplacement: + """ + A :class:`PromptReplacement` bound to a tokenizer to automatically + convert :attr:`target` and the result of :meth:`get_replacement` between + token sequence and text representations. + """ + tokenizer: AnyTokenizer = field(repr=False) + modality: str + + _target: PromptSeq + _replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) + + def __post_init__(self) -> None: + self._replacement_cache = dict[int, _BoundPromptReplacementGroup]() + + @property + def target(self) -> _BoundPromptSequence: + """The token sequence (or text) to find and replace.""" + return _BoundPromptSequence.from_seq(self.tokenizer, self._target) + + def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup: + """ + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). 
+ """ + replacement = self._replacement + if callable(replacement): + cache_key = item_idx + if cache_key in self._replacement_cache: + return self._replacement_cache[cache_key] + + replacement = replacement(item_idx) + else: + cache_key = None + + if not isinstance(replacement, PromptReplacementDetails): + replacement = PromptReplacementDetails.from_seq(replacement) + + bound_full = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.full) + bound_features = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.features) + bound_replacement = _BoundPromptReplacementGroup( + full=bound_full, + features=bound_features, + ) + + if cache_key is not None: + self._replacement_cache[cache_key] = bound_replacement + + return bound_replacement + + +class _TokenMatch(NamedTuple): + start_idx: int + end_idx: int + + +def iter_token_matches( + token_ids: list[int], + match_ids: list[int], +) -> Generator[_TokenMatch]: + """ + Yield each occurrence of :code:`match_ids` in :code:`token_ids`. + + Note that empty matches are ignored. 
+ """ + prompt_len = len(token_ids) + match_len = len(match_ids) + + if match_len == 0: + return + + start_idx = 0 + while start_idx < prompt_len - match_len + 1: + end_idx = start_idx + match_len + + if token_ids[start_idx:end_idx] == match_ids: + yield _TokenMatch(start_idx=start_idx, end_idx=end_idx) + + # Exclude overlapping matches + start_idx = end_idx + else: + start_idx += 1 + + +@dataclass(repr=False) +class _PromptReplacementMatch(ABC): + prompt_repl: BoundPromptReplacement + + @property + def modality(self) -> str: + return self.prompt_repl.modality + + @property + @abstractmethod + def start_idx(self) -> int: + raise NotImplementedError + + @property + @abstractmethod + def end_idx(self) -> int: + raise NotImplementedError + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r}, " + f"start_idx={self.start_idx!r}, end_idx={self.end_idx!r})") + + +@dataclass(repr=False) +class _PromptReplacementTokenMatch(_PromptReplacementMatch): + match: _TokenMatch + + @property + def start_idx(self) -> int: + return self.match.start_idx + + @property + def end_idx(self) -> int: + return self.match.end_idx + + +@dataclass(repr=False) +class _PromptReplacementTextMatch(_PromptReplacementMatch): + match: re.Match[str] + + @property + def start_idx(self) -> int: + return self.match.start() + + @property + def end_idx(self) -> int: + return self.match.end() + + +@dataclass +class PlaceholderFeaturesInfo: + modality: str + item_idx: int + start_idx: int + tokens: list[int] + + @property + def length(self) -> int: + return len(self.tokens) + + def to_range(self) -> PlaceholderRange: + return PlaceholderRange( + offset=self.start_idx, + length=self.length, + ) + + +def find_token_matches( + prompt: list[int], + prompt_repls: Sequence[BoundPromptReplacement], +) -> list[_PromptReplacementTokenMatch]: + """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" + return [ + _PromptReplacementTokenMatch(prompt_repl, match) + 
        for prompt_repl in prompt_repls
        for match in iter_token_matches(prompt, prompt_repl.target.token_ids)
    ]


def find_text_matches(
    prompt: str,
    prompt_repls: Sequence[BoundPromptReplacement],
) -> list[_PromptReplacementTextMatch]:
    """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
    return [
        _PromptReplacementTextMatch(prompt_repl, match)
        for prompt_repl in prompt_repls
        for match in re.finditer(re.escape(prompt_repl.target.text), prompt)
    ]


def _resolve_matches(
    prompt: PromptSeq,
    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
) -> list[_PromptReplacementMatch]:
    """
    Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
    and sort them such that earlier matches take priority over later ones.
    """
    matches = [m for matches in mm_matches.values() for m in matches]

    # One slot per prompt position; a non-None slot means that position is
    # already claimed by a match, so a second claim is an overlap error.
    seen_matches: list[Optional[_PromptReplacementMatch]] = [None
                                                             ] * len(prompt)

    for match in matches:
        for idx in range(match.start_idx, match.end_idx):
            if seen_matches[idx] is not None:
                raise ValueError("Found overlapping matches "
                                 f"({seen_matches[idx]} and {match}) "
                                 f"at index={idx} of prompt={prompt}")

            seen_matches[idx] = match

    return sorted(matches, key=lambda x: x.start_idx)


def _replace_matches(
    prompt: _S,
    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
    mm_item_counts: Mapping[str, int],
) -> list[_S]:
    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
    out_seqs = list[_S]()
    prev_end_idx = 0
    next_idx_by_modality = defaultdict[str, int](lambda: 0)

    for match in _resolve_matches(prompt, mm_matches):
        modality = match.modality

        # Stop replacing once we have consumed as many matches as there are
        # items of this modality; extra matches are left untouched.
        item_idx = next_idx_by_modality[modality]
        if item_idx >= mm_item_counts.get(modality, 0):
            continue

        start_idx = match.start_idx
        end_idx = match.end_idx

        repl_info = match.prompt_repl
        replacement = repl_info.get_replacement(item_idx)

        # The two branches are structurally identical; they differ only in
        # whether the replacement is taken as text or as token IDs, so that
        # `+` concatenates like with like.
        if isinstance(prompt, str):
            repl_seq = replacement.full.text
            out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)
        else:
            repl_seq = replacement.full.token_ids
            out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)

        prev_end_idx = end_idx
        next_idx_by_modality[modality] += 1

    out_seqs.append(prompt[prev_end_idx:])

    return out_seqs


def replace_token_matches(
    prompt: list[int],
    mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]],
    mm_item_counts: Mapping[str, int],
) -> list[int]:
    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
    if not mm_matches:
        return prompt

    token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts)

    return flatten_2d_lists(token_id_seqs)


def replace_text_matches(
    prompt: str,
    mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]],
    mm_item_counts: Mapping[str, int],
) -> str:
    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
    if not mm_matches:
        return prompt

    texts = _replace_matches(prompt, mm_matches, mm_item_counts)

    return "".join(texts)


def _iter_placeholders(
    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
    prompt: list[int],
    mm_item_counts: Mapping[str, int],
) -> Iterable[PlaceholderFeaturesInfo]:
    """
    Yield each set of placeholder tokens found in :code:`prompt`.

    Matches are exclusive even when multiple modalities share
    the same placeholder tokens. In that case, the modality that
    appears earlier in `mm_prompt_repls` takes priority.

    Note that empty matches are ignored.
+ """ + prompt_len = len(prompt) + item_idx_by_modality = defaultdict[str, int](lambda: 0) + + start_idx = 0 + while start_idx < prompt_len: + found = False + + for modality, modality_repls in mm_prompt_repls.items(): + item_idx = item_idx_by_modality[modality] + if item_idx >= mm_item_counts.get(modality, 0): + continue + + for repl_info in modality_repls: + replacement = repl_info.get_replacement(item_idx) + repl_tokens_full = replacement.full.token_ids + repl_len_full = len(repl_tokens_full) + end_idx_full = start_idx + repl_len_full + + if repl_len_full == 0 or end_idx_full > prompt_len: + continue + + if prompt[start_idx:end_idx_full] == repl_tokens_full: + repl_tokens_feat = replacement.features.token_ids + + try: + match = next( + iter_token_matches(repl_tokens_full, + repl_tokens_feat)) + yield PlaceholderFeaturesInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx + match.start_idx, + tokens=repl_tokens_feat, + ) + except StopIteration: + raise AssertionError( + f"{repl_tokens_feat=} should be a " + f"subsequence of {repl_tokens_full=}") from None + + # Exclude overlapping matches + start_idx = end_idx_full + item_idx_by_modality[modality] += 1 + found = True + break + + if found: + break # Go back to the outer while loop + + if not found: + start_idx += 1 + + +def find_mm_placeholders( + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + prompt: list[int], + mm_item_counts: Mapping[str, int], +) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) + return dict(full_groupby_modality(it)) + + +class ProcessingCache: + + def __init__(self, capacity: int) -> None: + super().__init__() + + # DEBUG: Set to None to disable + self.debug_cache_hit_ratio_steps: Optional[int] = None + + self._cache = LRUCache[str, MultiModalKwargsItem](capacity) + + def _maybe_log_cache_stats(self) -> None: + steps = self.debug_cache_hit_ratio_steps + if not steps: + return + + 
cache_stats = self._cache.stat() + if cache_stats.total % steps == 0: + logger.debug("ProcessingCache: hit_ratio = %.2f", + cache_stats.hit_ratio) + + def get( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + ) -> Optional[MultiModalKwargsItem]: + """ + Get a processed multi-modal item from the cache + according to its dependencies, including: + + - The model ID + - The modality of the item + - The original data item passed to the HF processor + - The configuration options of the HF processor + """ + self._maybe_log_cache_stats() + + cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + return self._cache.get(cache_key) + + def put( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + output_kwargs: MultiModalKwargsItem, + ) -> None: + """ + Put a processed multi-modal item into the cache + according to its dependencies (see :meth:`get`). + """ + cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + self._cache.put(cache_key, output_kwargs) + + +class BaseProcessingInfo: + """Base class to provide the information necessary for data processing.""" + + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__() + + self.ctx = ctx + + @property + def model_id(self) -> str: + return self.ctx.model_config.model + + def get_tokenizer(self) -> AnyTokenizer: + return self.ctx.tokenizer + + def get_hf_config(self) -> PretrainedConfig: + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + """ + Subclasses can override this method to handle + specific kwargs from model config or user inputs. + """ + return self.ctx.get_hf_processor(**kwargs) + + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. 
+ + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseMultiModalProcessor(ABC, Generic[_I]): + """ + Abstract base class to process multi-modal inputs to be used in vLLM. + + Not to be confused with :class:`transformers.ProcessorMixin`. + """ + + def __init__(self, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__() + + self.info = info + self.dummy_inputs = dummy_inputs + self.cache = cache + self.enable_sanity_checks = enable_sanity_checks + + self.data_parser = self._get_data_parser() + + def __call__( + self, + prompt: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + + def _get_data_parser(self) -> MultiModalDataParser: + """ + Construct a parser to preprocess multi-modal data items + before passing them to :meth:`_get_hf_mm_data`. + + You can support additional modalities by creating a subclass + of :class:`MultiModalDataParser` that has additional subparsers. + """ + return MultiModalDataParser() + + def _to_mm_items( + self, + mm_data: MultiModalDataDict, + ) -> MultiModalDataItems: + """ + Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` + before passing them to :meth:`_get_hf_mm_data`. 
+ """ + mm_items = self.data_parser.parse_mm_data(mm_data) + + mm_limits = self.info.ctx.get_mm_config().limit_per_prompt + for modality, items in mm_items.items(): + limit = mm_limits.get(modality, 1) + if len(items) > limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but passed {len(items)} " + f"{modality} items in the same prompt.") + + return mm_items + + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + """Given the HF-processed data, output the metadata of each field.""" + raise NotImplementedError + + @abstractmethod + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + """ + Given the original multi-modal items for this modality + and HF-processed data, output the replacements to perform. + + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. 
+ """ + raise NotImplementedError + + def _find_mm_placeholders( + self, + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + new_token_ids: list[int], + mm_item_counts: Mapping[str, int], + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + return find_mm_placeholders(mm_prompt_repls, new_token_ids, + mm_item_counts) + + def _get_hf_mm_data( + self, + mm_items: MultiModalDataItems, + ) -> tuple[Mapping[str, object], Mapping[str, object]]: + processor_data = dict[str, object]() + passthrough_data = dict[str, object]() + + for items in mm_items.values(): + processor_data.update(items.get_processor_data()) + passthrough_data.update(items.get_passthrough_data()) + + return processor_data, passthrough_data + + def _call_hf_processor( + self, + prompt: str, + # Not to be confused with `mm_data` in `self.apply`. + # This refers to the data to be passed to HF processor. + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. + """ + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + mm_kwargs, + ) + + def _apply_hf_processor_text_mm( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data + together. 
+ """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + ) + processed_data.update(passthrough_data) + + prompt_ids, = processed_data.pop("input_ids").tolist() + + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + ) + + return prompt_ids, mm_kwargs + + def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]: + """ + Apply the HF processor on the prompt text only. + + Since HF processor requires that text and multi-modal items + correspond to each other, we create dummy multi-modal items + to go along with the text. + """ + prompt_ids, _ = self._apply_hf_processor_text_mm( + prompt_text=prompt_text, + mm_items=MultiModalDataItems({}), + hf_processor_mm_kwargs={}, + ) + + return prompt_ids + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + """ + Apply the HF processor on the prompt tokens only. + + Most HF processors accept prompt text but not prompt tokens. + If the HF processor adds or removes tokens that are not related to + multi-modal data, you should override this method so it is consistent + with the output of :meth:`_apply_hf_processor_text_only` on the + corresponding text. + """ + return prompt_tokens + + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalKwargs: + """ + Apply the HF processor on the multi-modal data only. + + Since HF processor requires that text and multi-modal items + correspond to each other, we generate dummy text using + :class:`DummyInputsBuilder` to go along with the multi-modal data. 
+ """ + mm_counts = mm_items.get_all_counts() + + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( + self.info.ctx.model_config.max_model_len, + mm_counts, + ) + + _, mm_kwargs = self._apply_hf_processor_text_mm( + prompt_text=dummy_inputs.prompt_text, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return mm_kwargs + + def _apply_hf_processor_main( + self, + prompt: Union[str, list[int]], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + enable_hf_prompt_replacement: bool, + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data. + + Note: + If :code:`enable_hf_prompt_replacement=False`, the prompt should + correspond to the multi-modal items. + """ + if isinstance(prompt, str): + if enable_hf_prompt_replacement: + return self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + prompt_ids = self._apply_hf_processor_text_only(prompt) + else: + prompt_ids = self._apply_hf_processor_tokens_only(prompt) + + mm_missing_kwargs = self._apply_hf_processor_mm_only( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return prompt_ids, mm_missing_kwargs + + def _cached_apply_hf_processor( + self, + prompt: Union[str, list[int]], + mm_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the full prompt text, + caching the results and reusing cached results. 
+ """ + cache = self.cache + model_id = self.info.model_id + + _, passthrough_data = self._get_hf_mm_data(mm_data_items) + if cache is None or passthrough_data: + return self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, + ) + + mm_maybe_cached_kw_items = { + modality: [ + cache.get(model_id, modality, item, hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_data_items.items() + } + + mm_missing_idxs = { + modality: + [idx for idx, item in enumerate(kw_items) if item is None] + for modality, kw_items in mm_maybe_cached_kw_items.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + mm_missing_data_items = self._to_mm_items(mm_missing_data) + + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, + # so we need to pass `enable_hf_prompt_replacement=False` + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_missing_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=False, + ) + + mm_missing_next_idx = { + modality: 0 + for modality in mm_missing_data_items + } + + merged_kw_items = list[MultiModalKwargsItem]() + for modality, kw_items in mm_maybe_cached_kw_items.items(): + for idx, kw_item in enumerate(kw_items): + if kw_item is None: + kw_item = mm_missing_kwargs.get_item( + modality, + mm_missing_next_idx[modality], + ) + + cache.put( + model_id, + modality, + mm_data_items[modality][idx], + hf_processor_mm_kwargs, + kw_item, + ) + + mm_missing_next_idx[modality] += 1 + + merged_kw_items.append(kw_item) + + if self.enable_sanity_checks: + mm_missing_counts = mm_missing_data_items.get_all_counts() + assert all( + item_count == mm_missing_counts[modality] + for modality, item_count in mm_missing_next_idx.items()), dict( + 
mm_missing_next_idx=mm_missing_next_idx, + mm_missing_counts=mm_missing_counts) + + mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) + + return prompt_ids, mm_kwargs + + def _bind_and_group_repls( + self, + prompt_repls: list[PromptReplacement], + ) -> dict[str, list[BoundPromptReplacement]]: + tokenizer = self.info.get_tokenizer() + + it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) + return dict(full_groupby_modality(it)) + + def _always_apply_prompt_replacements(self) -> bool: + """ + A flag which can be overridden so that + :meth:`_apply_prompt_replacements` is always called even if we + detect that HF has performed processing via + :meth:`_find_placeholders_by_modality`. + + This is useful in cases where :meth:`_find_placeholders_by_modality` + cannot be reliably used to detect whether HF has performed processing. + """ + return False + + def _apply_prompt_replacements( + self, + token_ids: list[int], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: + tokenizer = self.info.get_tokenizer() + + mm_token_matches = { + modality: find_token_matches(token_ids, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } + mm_match_counts = { + modality: len(matches) + for modality, matches in mm_token_matches.items() + } + + # If the search text does not represent a special token, + # it may have different token IDs in the prompt, because + # the tokens may go across the boundaries of the search text. + # ---- + # e.g. when searching for "foo" in "food", if "food" itself makes + # up a token, then the token ID of "foo" will not appear at all + # ---- + # Since it is inefficient to search for all possible tokenizations + # of the search text in the prompt, we instead perform string + # replacement on the decoded token IDs, then encode them back. 
+ if all( + mm_match_counts.get(modality, 0) >= item_count + for modality, item_count in mm_item_counts.items() + ): # yapf: disable + token_ids = replace_token_matches( + token_ids, + mm_token_matches, + mm_item_counts, + ) + + text = decode_tokens(tokenizer, token_ids) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_token_matches.items() + } + else: + text = decode_tokens(tokenizer, token_ids) + + mm_text_matches = { + modality: find_text_matches(text, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } + text = replace_text_matches( + text, + mm_text_matches, + mm_item_counts, + ) + + token_ids = encode_tokens(tokenizer, + text, + add_special_tokens=False) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_text_matches.items() + } + + placeholders = self._find_mm_placeholders( + matched_repls, + token_ids, + mm_item_counts, + ) + + return token_ids, text, placeholders + + def _validate_mm_kwargs( + self, + mm_kwargs: MultiModalKwargs, + mm_item_counts: Mapping[str, int], + ) -> None: + for modality, item_count in mm_item_counts.items(): + if modality in mm_kwargs.modalities: + items = mm_kwargs.get_items(modality) + else: + items = [] + + if len(items) != item_count: + raise RuntimeError( + f"Expected there to be {item_count} {modality} items in " + f"keyword arguments corresponding to {item_count} " + f"{modality} data items, but only found {len(items)}! 
" + "There is likely a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_mm_fields_config`).") + + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], + mm_item_counts: Mapping[str, int], + *, + allow_missing: bool = False, + ) -> Mapping[str, int]: + missing_repl_counts = dict[str, int]() + + for modality, item_count in mm_item_counts.items(): + placeholders = mm_placeholders.get(modality, []) + + if len(placeholders) != item_count and not allow_missing: + raise RuntimeError( + f"Expected there to be {item_count} prompt replacements " + f"corresponding to {item_count} {modality} items, but only " + f"found {len(placeholders)} prompt replacements! Either " + "the prompt text has missing/incorrect tokens for " + "multi-modal inputs, or there is a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_prompt_replacements`).") + + missing_repl_counts[modality] = item_count - len(placeholders) + + return missing_repl_counts + + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + """ + Process multi-modal inputs to be used in vLLM. + + The main steps are: + + 1. Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + 2. Find and replace sequences in the token IDs with placeholder tokens. + The number of placeholder tokens equals the feature size of the + multi-modal data outputted by the multi-modal encoder. + 3. Extract information about the placeholder tokens from the + processed token IDs. 
+ """ + mm_items = self._to_mm_items(mm_data) + + # Create MM hashes (only used in V1) + # TODO: Use these hash keys for caching operations in apply_hf_processor + # instead of rehashing. + + if envs.VLLM_USE_V1: + model_id = self.info.model_id + mm_hashes = { + modality: [ + MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_items.items() + } + else: + mm_hashes = None + + prompt_ids, mm_kwargs = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, + ) + + unbound_prompt_repls = self._get_prompt_replacements( + mm_items, + hf_processor_mm_kwargs, + mm_kwargs, + ) + mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls) + + mm_item_counts = mm_items.get_all_counts() + self._validate_mm_kwargs(mm_kwargs, mm_item_counts) + + hf_mm_placeholders = self._find_mm_placeholders( + mm_prompt_repls, + prompt_ids, + mm_item_counts, + ) + + if self._always_apply_prompt_replacements(): + mm_missing_repl_counts = mm_item_counts + mm_missing_repls = dict(mm_prompt_repls) + else: + mm_missing_repl_counts = self._validate_mm_placeholders( + hf_mm_placeholders, + mm_item_counts, + allow_missing=True, + ) + + mm_missing_repls = dict[str, list[BoundPromptReplacement]]() + for modality, missing_repl_count in mm_missing_repl_counts.items(): + if missing_repl_count == 0: + mm_missing_repls[modality] = [] + elif missing_repl_count == mm_item_counts.get(modality, 0): + mm_missing_repls[modality] = mm_prompt_repls[modality] + else: + raise ValueError("Partial prompt replacement within " + f"{modality=} is not supported") + + # If HF processor already inserts placeholder tokens, + # there is no need for us to insert them + if all(len(repls) == 0 for repls in mm_missing_repls.values()): + tokenizer = self.info.get_tokenizer() + prompt = decode_tokens(tokenizer, prompt_ids) + mm_placeholders = hf_mm_placeholders + else: + ( + prompt_ids, + prompt, + 
missing_mm_placeholders, + ) = self._apply_prompt_replacements( + prompt_ids, + mm_missing_repls, + mm_missing_repl_counts, + ) + + mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders} + + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) + + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() + } + + return MultiModalInputs( + type="multimodal", + prompt=prompt, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_hashes=mm_hashes, + mm_placeholders=mm_placeholder_ranges, + ) diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/profiling.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/profiling.py new file mode 100644 index 0000000000000000000000000000000000000000..5dd7548540448c17d3395c30aeb17d8c0f85fa59 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/profiling.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Generic, TypeVar + +import numpy as np +import numpy.typing as npt +from PIL import Image + +import vllm.envs as envs +from vllm.inputs import DummyData +from vllm.logger import init_logger + +from .inputs import MultiModalDataDict, MultiModalInputs +from .processing import BaseMultiModalProcessor, BaseProcessingInfo + +logger = init_logger(__name__) + + +@dataclass +class ProcessorInputs: + """ + Represents the keyword arguments to + :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + """ + prompt_text: str + mm_data: MultiModalDataDict + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseDummyInputsBuilder(ABC, Generic[_I]): + """ + Abstract base class that constructs the dummy data to profile + multi-modal models. 
+ """ + + def __init__(self, info: _I) -> None: + super().__init__() + + self.info = info + + @abstractmethod + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + """ + Build the input which, after processing, results in + :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens. + """ + raise NotImplementedError + + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + + +class MultiModalProfiler(Generic[_I]): + """ + Contains code for running memory profiling for multi-modal models. 
+ """ + + def __init__( + self, + processor: BaseMultiModalProcessor[_I], + ) -> None: + super().__init__() + + self.processor = processor + + @property + def processing_info(self) -> BaseProcessingInfo: + return self.processor.info + + @property + def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: + return self.processor.dummy_inputs + + def get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.processing_info.ctx.get_mm_config() + mm_limit_per_prompt = mm_config.limit_per_prompt + + supported_mm_limits = self.processing_info.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits + + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputs: + factory = self.dummy_inputs + processor_inputs = factory.get_dummy_processor_inputs( + seq_len, mm_counts) + + return self.processor.apply( + prompt=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + + def get_dummy_data(self, seq_len: int) -> DummyData: + # Avoid circular import + from vllm.sequence import SequenceData + + mm_counts = self.get_mm_limits() + + info = self.processing_info + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item( + seq_len, mm_counts) + + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") 
+ + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + prompt_token_ids = mm_inputs["prompt_token_ids"] + placeholders_by_modality = mm_inputs["mm_placeholders"] + + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") + + total_len = len(prompt_token_ids) + + # V0 does not support chunked prefill. + if total_len > seq_len and not envs.VLLM_USE_V1: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain multi-modal " + "inputs to fail during inference, even when the input text is " + "short. 
To avoid this, you should increase `max_model_len`, " + "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, + total_len, total_placeholders_by_modality) + + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) + + return DummyData( + seq_data=SequenceData.from_seqs(prompt_token_ids), + multi_modal_data=mm_inputs["mm_kwargs"], + multi_modal_placeholders=placeholders_by_modality, + ) diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/registry.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..04141114288c9339b1d636e7741379edd6fe3b15 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/registry.py @@ -0,0 +1,458 @@ +# SPDX-License-Identifier: Apache-2.0 + +import functools +from collections import UserDict +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, + Protocol, Sequence, Type, TypeVar) + +import torch.nn as nn + +from vllm.inputs import InputProcessingContext +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import ClassRegistry + +from .audio import AudioPlugin +from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc +from .image import ImagePlugin +from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors +from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, + ProcessingCache) +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler +from .utils import cached_get_tokenizer +from .video import VideoPlugin + +if TYPE_CHECKING: + from vllm.config import ModelConfig + +logger = init_logger(__name__) + +# TODO: Tune the MM cache size +MM_CACHE_SIZE = 256 + +N = TypeVar("N", bound=Type[nn.Module]) 
+_I = TypeVar("_I", bound=BaseProcessingInfo) +_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) + + +class ProcessingInfoFactory(Protocol[_I_co]): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + ctx: InputProcessingContext, + ) -> _I_co: + ... + + +class DummyInputsBuilderFactory(Protocol[_I]): + """ + Constructs a :class:`BaseDummyInputsBuilder` instance from the context. + """ + + def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: + ... + + +class MultiModalProcessorFactory(Protocol[_I]): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], + *, + cache: Optional[ProcessingCache] = None, + ) -> BaseMultiModalProcessor[_I]: + ... + + +@dataclass(frozen=True) +class _ProcessorFactories(Generic[_I]): + info: ProcessingInfoFactory[_I] + processor: MultiModalProcessorFactory[_I] + dummy_inputs: DummyInputsBuilderFactory[_I] + + def build_processor( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ): + info = self.info(ctx) + dummy_inputs_builder = self.dummy_inputs(info) + return self.processor(info, dummy_inputs_builder, cache=cache) + + +class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): + """ + Wraps `_limits_by_model` for a more informative error message + when attempting to access a model that does not exist. + """ + + def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: + try: + return super().__getitem__(key) + except KeyError as exc: + msg = (f"Cannot find `mm_limits` for model={key.model}. Did you " + "forget to call `init_mm_limits_per_prompt`?") + raise KeyError(msg) from exc + + +class MultiModalRegistry: + """ + A registry that dispatches data processing according to the model. 
+ """ + + DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin()) + + def __init__( + self, + *, + plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: + self._plugins = {p.get_data_key(): p for p in plugins} + + self._processor_factories = ClassRegistry[nn.Module, + _ProcessorFactories]() + + # This is used for non-multimodal models + self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} + + self._limits_by_model = _MultiModalLimits() + + self._processing_cache = ProcessingCache(MM_CACHE_SIZE) + + def register_plugin(self, plugin: MultiModalPlugin) -> None: + """ + Register a multi-modal plugin so it can be recognized by vLLM. + """ + data_type_key = plugin.get_data_key() + + if data_type_key in self._plugins: + logger.warning( + "A plugin is already registered for data type %s, " + "and will be overwritten by the new plugin %s.", data_type_key, + plugin) + + self._plugins[data_type_key] = plugin + + def _get_plugin(self, data_type_key: str): + plugin = self._plugins.get(data_type_key) + if plugin is not None: + return plugin + + msg = f"Unknown multi-modal data type: {data_type_key}" + raise NotImplementedError(msg) + + def register_input_mapper( + self, + data_type_key: str, + mapper: Optional[MultiModalInputMapper] = None, + ): + """ + Register an input mapper for a specific modality to a model class. + + See :meth:`MultiModalPlugin.register_input_mapper` for more details. + """ + return self._get_plugin(data_type_key).register_input_mapper(mapper) + + def register_image_input_mapper( + self, + mapper: Optional[MultiModalInputMapper] = None, + ): + """ + Register an input mapper for image data to a model class. + + See :meth:`MultiModalPlugin.register_input_mapper` for more details. 
+ """ + return self.register_input_mapper("image", mapper) + + def map_input( + self, + model_config: "ModelConfig", + data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + ) -> MultiModalKwargs: + """ + Apply an input mapper to the data passed to the model. + + The data belonging to each modality is passed to the corresponding + plugin which in turn converts the data into into keyword arguments + via the input mapper registered for that model. + + See :meth:`MultiModalPlugin.map_input` for more details. + + Note: + This should be called after :meth:`init_mm_limits_per_prompt`. + """ + merged_dict: Dict[str, NestedTensors] = {} + + for data_key, data_value in data.items(): + plugin = self._get_plugin(data_key) + + num_items = len(data_value) if isinstance(data_value, list) else 1 + max_items = self._limits_by_model[model_config][data_key] + if num_items > max_items: + raise ValueError( + f"You set {data_key}={max_items} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but found {num_items} items " + "in the same prompt.") + + input_dict = plugin.map_input(model_config, data_value, + mm_processor_kwargs) + for input_key, input_tensor in input_dict.items(): + if input_key in merged_dict: + raise ValueError(f"The input mappers (keys={set(data)}) " + f"resulted in a conflicting keyword " + f"argument to `forward()`: {input_key}") + + merged_dict[input_key] = input_tensor + + return MultiModalKwargs(merged_dict) + + def create_input_mapper(self, model_config: "ModelConfig"): + """ + Create an input mapper (see :meth:`map_input`) for a specific model. + """ + # NOTE - we currently make the assumption that if a model has multiple + # supported modalities, they take the same kwargs. For the default, + # this could be an issue in the future if it falls back to two HF + # resources and we can't inspect the signature easily since it's + # getting initialized through the autoclass. 
+ # + # If this is a problem in the future, we should revisit it, but since + # it potentially introduces a lot of complexity for a currently + # uncommon case, we do not for simplicity of both use & implementation + return functools.partial(self.map_input, model_config) + + def register_max_multimodal_tokens( + self, + data_type_key: str, + max_mm_tokens: Optional[MultiModalTokensCalc] = None, + ): + """ + Register the maximum number of tokens, corresponding to a single + instance of multimodal data belonging to a specific modality, that are + passed to the language model for a model class. + """ + return self._get_plugin(data_type_key) \ + .register_max_multimodal_tokens(max_mm_tokens) + + def register_max_image_tokens( + self, + max_mm_tokens: Optional[MultiModalTokensCalc] = None, + ): + """ + Register the maximum number of image tokens, corresponding to a single + image, that are passed to the language model for a model class. + """ + return self.register_max_multimodal_tokens("image", max_mm_tokens) + + def get_max_tokens_per_item_by_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality based + on underlying model configuration. 
+ """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + seq_len = model_config.max_model_len + mm_limits = self.get_mm_limits_per_prompt(model_config) + return processor.info.get_mm_max_tokens_per_item( + seq_len, mm_limits) + + return { + key: plugin.get_max_multimodal_tokens(model_config) + for key, plugin in self._plugins.items() + } + + def get_max_tokens_per_item_by_nonzero_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality based + on underlying model configuration, excluding modalities that user + explicitly disabled via `limit_mm_per_prompt`. + + Note: + This is currently directly used only in V1 for profiling the memory + usage of a model. + """ + mm_limits = self.get_mm_limits_per_prompt(model_config) + + return { + key: max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() + if mm_limits[key] > 0 + } + + def get_max_tokens_by_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens from each modality + for profiling the memory usage of a model. + + See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. + + Note: + This should be called after :meth:`init_mm_limits_per_prompt`. + """ + mm_limits = self.get_mm_limits_per_prompt(model_config) + + return { + key: mm_limits[key] * max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() + } + + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: + """ + Get the maximum number of multi-modal tokens + for profiling the memory usage of a model. 
+ + See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. + + Note: + This should be called after :meth:`init_mm_limits_per_prompt`. + """ + return sum(self.get_max_tokens_by_modality(model_config).values()) + + def init_mm_limits_per_prompt( + self, + model_config: "ModelConfig", + ) -> None: + """ + Initialize the maximum number of multi-modal input instances for each + modality that are allowed per prompt for a model class. + """ + if model_config in self._limits_by_model: + logger.warning( + "`mm_limits` has already been set for model=%s, and will " + "be overwritten by the new values.", model_config.model) + + multimodal_config = model_config.multimodal_config + if multimodal_config is None: + limits_per_plugin = self._disabled_limits_per_plugin + else: + config_limits_per_plugin = multimodal_config.limit_per_prompt + + extra_keys = config_limits_per_plugin.keys() - self._plugins.keys() + if extra_keys: + logger.warning( + "Detected extra keys in `--limit-mm-per-prompt` which " + "are not registered as multi-modal plugins: %s. " + "They will be ignored.", extra_keys) + + # NOTE: Currently the default is set to 1 for each plugin + # TODO: Automatically determine the limits based on budget + # once more models support multi-image inputs + limits_per_plugin = { + key: config_limits_per_plugin.get(key, 1) + for key in self._plugins + } + + self._limits_by_model[model_config] = limits_per_plugin + + def get_mm_limits_per_prompt( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of multi-modal input instances for each modality + that are allowed per prompt for a model class. + + Note: + This should be called after :meth:`init_mm_limits_per_prompt`. 
+ """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + + return self._limits_by_model[model_config] + + def register_processor( + self, + processor: MultiModalProcessorFactory[_I], + *, + info: ProcessingInfoFactory[_I], + dummy_inputs: DummyInputsBuilderFactory[_I], + ): + """ + Register a multi-modal processor to a model class. The processor + is constructed lazily, hence a factory method should be passed. + + When the model receives multi-modal data, the provided function is + invoked to transform the data into a dictionary of model inputs. + + See also: + :ref:`mm-processing` + """ + + def wrapper(model_cls: N) -> N: + if self._processor_factories.contains(model_cls, strict=True): + logger.warning( + "Model class %s already has a multi-modal processor " + "registered to %s. It is overwritten by the new one.", + model_cls, self) + + self._processor_factories[model_cls] = _ProcessorFactories( + info=info, + dummy_inputs=dummy_inputs, + processor=processor, + ) + + return model_cls + + return wrapper + + def _get_model_cls(self, model_config: "ModelConfig"): + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + return model_cls + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. + + See also: + :ref:`mm-processing` + """ + return self._get_model_cls(model_config) in self._processor_factories + + def create_processor( + self, + model_config: "ModelConfig", + tokenizer: AnyTokenizer, + ) -> BaseMultiModalProcessor[BaseProcessingInfo]: + """ + Create a multi-modal processor for a specific model and tokenizer. 
+ + See also: + :ref:`mm-processing` + """ + model_cls = self._get_model_cls(model_config) + factories = self._processor_factories[model_cls] + + ctx = InputProcessingContext(model_config, tokenizer) + cache = (None if model_config.disable_mm_preprocessor_cache else + self._processing_cache) + + return factories.build_processor(ctx, cache=cache) diff --git a/.venv/lib/python3.11/site-packages/vllm/multimodal/utils.py b/.venv/lib/python3.11/site-packages/vllm/multimodal/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..583f536551243cc7eeffb20b74dee3d2a9da274d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/multimodal/utils.py @@ -0,0 +1,518 @@ +# SPDX-License-Identifier: Apache-2.0 + +from functools import lru_cache +from itertools import groupby +from pathlib import Path +from typing import TYPE_CHECKING, Optional, TypeVar, Union +from urllib.parse import ParseResult, urlparse + +import numpy as np +import numpy.typing as npt +from PIL import Image + +import vllm.envs as envs +from vllm.connections import HTTPConnection, global_http_connection +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer + +from .audio import AudioMediaIO +from .base import MediaIO +from .image import ImageMediaIO +from .inputs import PlaceholderRange +from .video import VideoMediaIO + +logger = init_logger(__name__) + +cached_get_tokenizer = lru_cache(get_tokenizer) + +_M = TypeVar("_M") + +if TYPE_CHECKING: + from .hasher import MultiModalHashDict + from .inputs import MultiModalKwargs, MultiModalPlaceholderDict + + +class MediaConnector: + + def __init__( + self, + connection: HTTPConnection = global_http_connection, + *, + allowed_local_media_path: str = "", + ) -> None: + super().__init__() + + self.connection = connection + + if allowed_local_media_path: + allowed_local_media_path_ = Path(allowed_local_media_path) + + if not allowed_local_media_path_.exists(): + raise ValueError( + 
"Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} does not exist.") + if not allowed_local_media_path_.is_dir(): + raise ValueError( + "Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} must be a directory.") + else: + allowed_local_media_path_ = None + + self.allowed_local_media_path = allowed_local_media_path_ + + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + data_spec, data = url_spec.path.split(",", 1) + media_type, data_type = data_spec.split(";", 1) + + if data_type != "base64": + msg = "Only base64 data URLs are supported for now." + raise NotImplementedError(msg) + + return media_io.load_base64(media_type, data) + + def _load_file_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + allowed_local_media_path = self.allowed_local_media_path + if allowed_local_media_path is None: + raise RuntimeError("Cannot load local files without " + "`--allowed-local-media-path`.") + + filepath = Path(url_spec.path) + if allowed_local_media_path not in filepath.resolve().parents: + raise ValueError( + f"The file path {filepath} must be a subpath " + f"of `--allowed-local-media-path` {allowed_local_media_path}.") + + return media_io.load_file(filepath) + + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) + + if url_spec.scheme.startswith("http"): + connection = self.connection + data = connection.get_bytes(url, timeout=fetch_timeout) + + return media_io.load_bytes(data) + + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) + + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) + + msg = "The URL must be either a HTTP, data or file URL." 
+ raise ValueError(msg) + + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) + + if url_spec.scheme.startswith("http"): + connection = self.connection + data = await connection.async_get_bytes(url, timeout=fetch_timeout) + + return media_io.load_bytes(data) + + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) + + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) + + msg = "The URL must be either a HTTP, data or file URL." + raise ValueError(msg) + + def fetch_audio( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Load audio from a URL. + """ + audio_io = AudioMediaIO() + + return self.load_from_url( + audio_url, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) + + async def fetch_audio_async( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Asynchronously fetch audio from a URL. + """ + audio_io = AudioMediaIO() + + return await self.load_from_url_async( + audio_url, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) + + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + ) -> Image.Image: + """ + Load a PIL image from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + + return self.load_from_url( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) + + async def fetch_image_async( + self, + image_url: str, + *, + image_mode: str = "RGB", + ) -> Image.Image: + """ + Asynchronously load a PIL image from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. 
+ """ + image_io = ImageMediaIO(image_mode=image_mode) + + return await self.load_from_url_async( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) + + def fetch_video( + self, + video_url: str, + *, + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Load video from a HTTP or base64 data URL. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return self.load_from_url( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + + async def fetch_video_async( + self, + video_url: str, + *, + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Asynchronously load video from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return await self.load_from_url_async( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + + +global_media_connector = MediaConnector() +"""The global :class:`MediaConnector` instance used by vLLM.""" + +fetch_audio = global_media_connector.fetch_audio +fetch_image = global_media_connector.fetch_image +fetch_video = global_media_connector.fetch_video + + +def encode_audio_base64( + audio: np.ndarray, + sampling_rate: int, +) -> str: + """Encode audio as base64.""" + audio_io = AudioMediaIO() + return audio_io.encode_base64((audio, sampling_rate)) + + +def encode_image_base64( + image: Image.Image, + *, + image_mode: str = "RGB", + format: str = "JPEG", +) -> str: + """ + Encode a pillow image to base64 format. + + By default, the image is converted into RGB format before being encoded. 
+ """ + image_io = ImageMediaIO(image_mode=image_mode) + return image_io.encode_base64(image, image_format=format) + + +def encode_video_base64(frames: npt.NDArray) -> str: + image_io = ImageMediaIO() + video_io = VideoMediaIO(image_io) + return video_io.encode_base64(frames) + + +# Utilities for input processors +_T = TypeVar("_T", str, int) + + +def repeat_and_pad_token( + token: _T, + *, + repeat_count: int = 1, + pad_token_left: Optional[_T] = None, + pad_token_right: Optional[_T] = None, +) -> list[_T]: + replacement = [token] * repeat_count + if pad_token_left is not None: + replacement = [pad_token_left] + replacement + if pad_token_right is not None: + replacement = replacement + [pad_token_right] + + return replacement + + +def repeat_and_pad_placeholder_tokens( + tokenizer: AnyTokenizer, + prompt: Optional[str], + prompt_token_ids: list[int], + *, + placeholder_token_id: int, + repeat_count: Union[int, list[int]], + pad_token_left: Optional[int] = None, + pad_token_right: Optional[int] = None, +) -> tuple[Optional[str], list[int], list[PlaceholderRange]]: + if isinstance(repeat_count, int): + repeat_count = [repeat_count] + + if prompt is None: + new_prompt = None + else: + placeholder_token_str = tokenizer.decode(placeholder_token_id) + pad_token_str_left = (None if pad_token_left is None else + tokenizer.decode(pad_token_left)) + pad_token_str_right = (None if pad_token_right is None else + tokenizer.decode(pad_token_right)) + + placeholder_token_count = prompt.count(placeholder_token_str) + # This is an arbitrary number to distinguish between the two cases + if placeholder_token_count > 16: + logger.warning( + "Please follow the prompt format that is " + "documented on HuggingFace which does not involve " + "repeating %s tokens.", placeholder_token_str) + if placeholder_token_count < len(repeat_count): + logger.warning( + "The number of multi-modal placeholder tokens in the prompt " + "is less than the number of multi-modal inputs. 
Extra " + "placeholder tokens will be treated as plain text") + repeat_count = repeat_count[:placeholder_token_count] + + prompt_parts = prompt.split(placeholder_token_str, + maxsplit=len(repeat_count)) + new_prompt = "" + for i, repeat_count_item in enumerate(repeat_count): + replacement_str = "".join( + repeat_and_pad_token( + placeholder_token_str, + repeat_count=repeat_count_item, + pad_token_left=pad_token_str_left, + pad_token_right=pad_token_str_right, + )) + # The image tokens are removed to be consistent with HuggingFace + new_prompt += prompt_parts[i] + replacement_str + new_prompt += prompt_parts[-1] + + new_token_ids = list[int]() + placeholder_ranges = list[PlaceholderRange]() + placeholder_token_idx = 0 + for i, token in enumerate(prompt_token_ids): + if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] + replacement_ids = repeat_and_pad_token( + placeholder_token_id, + repeat_count=curr_repeat_count, + pad_token_left=pad_token_left, + pad_token_right=pad_token_right, + ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 + placeholder_ranges.append({ + "offset": offset, + "length": curr_repeat_count, + }) + new_token_ids.extend(replacement_ids) + placeholder_token_idx += 1 + + # No need to further scan the list since we replaced all tokens + if placeholder_token_idx >= len(repeat_count): + new_token_ids.extend(prompt_token_ids[i + 1:]) + break + else: + new_token_ids.append(token) + + return new_prompt, new_token_ids, placeholder_ranges + + +def consecutive_placeholder_ranges( + num_items: int, + item_size: int, + initial_offset: int = 0) -> list[PlaceholderRange]: + """Returns a list of consecutive PlaceholderRanges of a fixed size""" + + return [ + PlaceholderRange(offset=initial_offset + i * item_size, + length=item_size) for i in range(num_items) + ] + + +def merge_and_sort_multimodal_metadata( + mm_positions: "MultiModalPlaceholderDict", + mm_hashes: Optional["MultiModalHashDict"], 
+) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]: + """Given a MultiModalPlaceholderDict, merge all PlaceholderRange + objects from all available modalities into a single list of + PlaceholderRange, sorted by their offset (starting index in the input + sequence) in the ascending order. + + Optionally if a MultiModalHashDict is given, same operation will be + applied to the object and the sorted list of hashes will be returned. + + Raises: + ValueError: If the input prompt has interleaved placeholders from + different modalities (e.g, "