koichi12 commited on
Commit
a1956b1
·
verified ·
1 Parent(s): 2bdf65d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/vllm/assets/__init__.py +0 -0
  2. .venv/lib/python3.11/site-packages/vllm/assets/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/vllm/assets/__pycache__/audio.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/vllm/assets/__pycache__/base.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/vllm/assets/__pycache__/image.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/vllm/assets/__pycache__/video.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/vllm/assets/audio.py +33 -0
  8. .venv/lib/python3.11/site-packages/vllm/assets/base.py +40 -0
  9. .venv/lib/python3.11/site-packages/vllm/assets/image.py +31 -0
  10. .venv/lib/python3.11/site-packages/vllm/assets/video.py +84 -0
  11. .venv/lib/python3.11/site-packages/vllm/multimodal/__init__.py +33 -0
  12. .venv/lib/python3.11/site-packages/vllm/multimodal/hasher.py +102 -0
  13. .venv/lib/python3.11/site-packages/vllm/multimodal/image.py +139 -0
  14. .venv/lib/python3.11/site-packages/vllm/multimodal/inputs.py +741 -0
  15. .venv/lib/python3.11/site-packages/vllm/multimodal/parse.py +368 -0
  16. .venv/lib/python3.11/site-packages/vllm/multimodal/processing.py +1295 -0
  17. .venv/lib/python3.11/site-packages/vllm/multimodal/profiling.py +209 -0
  18. .venv/lib/python3.11/site-packages/vllm/multimodal/registry.py +458 -0
  19. .venv/lib/python3.11/site-packages/vllm/multimodal/utils.py +518 -0
  20. .venv/lib/python3.11/site-packages/vllm/multimodal/video.py +191 -0
  21. .venv/lib/python3.11/site-packages/vllm/triton_utils/__init__.py +12 -0
  22. .venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/importing.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/vllm/worker/__init__.py +0 -0
  26. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/__init__.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cache_engine.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_model_runner.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_pooling_model_runner.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_worker.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/enc_dec_model_runner.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/hpu_worker.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/model_runner.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/model_runner_base.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_model_runner.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_worker.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/neuron_model_runner.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/neuron_worker.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/openvino_model_runner.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/openvino_worker.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/pooling_model_runner.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/tpu_model_runner.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/tpu_worker.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/utils.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/worker.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/worker_base.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/xpu_model_runner.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/vllm/worker/__pycache__/xpu_worker.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/assets/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/audio.cpython-311.pyc ADDED
Binary file (2.03 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/base.cpython-311.pyc ADDED
Binary file (1.87 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/image.cpython-311.pyc ADDED
Binary file (1.87 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/assets/__pycache__/video.cpython-311.pyc ADDED
Binary file (4.79 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/assets/audio.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+ from urllib.parse import urljoin
6
+
7
+ import numpy.typing as npt
8
+
9
+ from vllm.utils import PlaceholderModule
10
+
11
+ from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
12
+
13
+ try:
14
+ import librosa
15
+ except ImportError:
16
+ librosa = PlaceholderModule("librosa") # type: ignore[assignment]
17
+
18
+ ASSET_DIR = "multimodal_asset"
19
+
20
+
21
@dataclass(frozen=True)
class AudioAsset:
    """Test audio asset served from the vLLM public S3 bucket."""

    name: Literal["winning_call", "mary_had_lamb"]

    @property
    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
        """Download (if needed) and decode the asset as ``(samples, rate)``."""
        local_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                            s3_prefix=ASSET_DIR)
        # sr=None keeps the file's native sampling rate.
        return librosa.load(local_path, sr=None)

    @property
    def url(self) -> str:
        """Public S3 URL of this asset."""
        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
.venv/lib/python3.11/site-packages/vllm/assets/base.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from functools import lru_cache
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import vllm.envs as envs
8
+ from vllm.connections import global_http_connection
9
+
10
+ VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
11
+
12
+
13
def get_cache_dir() -> Path:
    """Get the path to the cache for storing downloaded assets."""
    cache_dir = Path(envs.VLLM_ASSETS_CACHE)
    # Create it eagerly so callers can write into it right away.
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir
19
+
20
+
21
@lru_cache
def get_vllm_public_assets(filename: str,
                           s3_prefix: Optional[str] = None) -> Path:
    """
    Download an asset file from ``s3://vllm-public-assets``
    and return the path to the downloaded file.

    Args:
        filename: Name of the object (and of the cached local file).
        s3_prefix: Optional key prefix (directory) within the bucket.

    The result is cached both on disk (under the assets cache dir) and
    in-process via ``lru_cache``.
    """
    asset_directory = get_cache_dir() / "vllm_public_assets"
    asset_directory.mkdir(parents=True, exist_ok=True)

    asset_path = asset_directory / filename
    if not asset_path.exists():
        if s3_prefix is not None:
            filename = s3_prefix + "/" + filename
        # FIX: the download URL must target the requested object key
        # (bucket URL + "/" + [prefix/]filename); the key portion was
        # previously malformed.
        global_http_connection.download_file(
            f"{VLLM_S3_BUCKET_URL}/{filename}",
            asset_path,
            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)

    return asset_path
.venv/lib/python3.11/site-packages/vllm/assets/image.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ import torch
7
+ from PIL import Image
8
+
9
+ from .base import get_vllm_public_assets
10
+
11
+ VLM_IMAGES_DIR = "vision_model_images"
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class ImageAsset:
16
+ name: Literal["stop_sign", "cherry_blossom"]
17
+
18
+ @property
19
+ def pil_image(self) -> Image.Image:
20
+ image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
21
+ s3_prefix=VLM_IMAGES_DIR)
22
+ return Image.open(image_path)
23
+
24
+ @property
25
+ def image_embeds(self) -> torch.Tensor:
26
+ """
27
+ Image embeddings, only used for testing purposes with llava 1.5.
28
+ """
29
+ image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
30
+ s3_prefix=VLM_IMAGES_DIR)
31
+ return torch.load(image_path, map_location="cpu", weights_only=True)
.venv/lib/python3.11/site-packages/vllm/assets/video.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from dataclasses import dataclass
4
+ from functools import lru_cache
5
+ from typing import List, Literal
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import numpy.typing as npt
10
+ from huggingface_hub import hf_hub_download
11
+ from PIL import Image
12
+
13
+ from vllm.multimodal.video import sample_frames_from_video
14
+
15
+ from .base import get_cache_dir
16
+
17
+
18
@lru_cache
def download_video_asset(filename: str) -> str:
    """
    Download and open an image from huggingface
    repo: raushan-testing-hf/videos-test
    """
    target_dir = get_cache_dir() / "video-example-data"
    target_dir.mkdir(parents=True, exist_ok=True)

    local_path = target_dir / filename
    if local_path.exists():
        # Already cached locally from a previous run.
        return str(local_path)

    # Not cached: fetch via the HF hub, which returns its own path.
    return hf_hub_download(
        repo_id="raushan-testing-hf/videos-test",
        filename=filename,
        repo_type="dataset",
        cache_dir=target_dir,
    )
37
+
38
+
39
def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
    """
    Decode a video file into a stacked ndarray of BGR frames.

    Args:
        path: Path of the video file to decode.
        num_frames: Number of frames to sample (``-1`` keeps all frames).

    Raises:
        ValueError: If the file cannot be opened, no frame can be decoded,
            or fewer than ``num_frames`` frames are available.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    for _ in range(total_frames):
        ret, frame = cap.read()
        if not ret:
            # FIX: stop at the first failed read instead of silently
            # skipping frames; OpenCV yields no more frames past this point.
            break
        frames.append(frame)
    cap.release()

    if not frames:
        # FIX: previously np.stack([]) raised an opaque error here.
        raise ValueError(f"Could not read any frames from video file {path}")

    frames = np.stack(frames)
    frames = sample_frames_from_video(frames, num_frames)
    if len(frames) < num_frames:
        raise ValueError(f"Could not read enough frames from video file {path}"
                         f" (expected {num_frames} frames, got {len(frames)})")
    return frames
58
+
59
+
60
def video_to_pil_images_list(path: str,
                             num_frames: int = -1) -> List[Image.Image]:
    """Decode a video into a list of RGB PIL images."""
    bgr_frames = video_to_ndarrays(path, num_frames)
    images: List[Image.Image] = []
    for bgr in bgr_frames:
        # OpenCV decodes in BGR order; PIL expects RGB.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        images.append(Image.fromarray(rgb))
    return images
67
+
68
+
69
@dataclass(frozen=True)
class VideoAsset:
    """Test video asset downloaded from the HF dataset repo."""

    name: Literal["sample_demo_1.mp4"]
    num_frames: int = -1

    @property
    def pil_images(self) -> List[Image.Image]:
        """Frames of the video as RGB PIL images."""
        local_path = download_video_asset(self.name)
        return video_to_pil_images_list(local_path, self.num_frames)

    @property
    def np_ndarrays(self) -> npt.NDArray:
        """Frames of the video as a stacked ndarray (BGR)."""
        local_path = download_video_asset(self.name)
        return video_to_ndarrays(local_path, self.num_frames)
.venv/lib/python3.11/site-packages/vllm/multimodal/__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from .base import MultiModalPlaceholderMap, MultiModalPlugin
4
+ from .hasher import MultiModalHashDict, MultiModalHasher
5
+ from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
6
+ MultiModalDataDict, MultiModalKwargs,
7
+ MultiModalPlaceholderDict, NestedTensors)
8
+ from .registry import MultiModalRegistry
9
+
10
+ MULTIMODAL_REGISTRY = MultiModalRegistry()
11
+ """
12
+ The global :class:`~MultiModalRegistry` is used by model runners to
13
+ dispatch data processing according to the target model.
14
+
15
+ See also:
16
+ :ref:`mm-processing`
17
+ """
18
+
19
+ __all__ = [
20
+ "BatchedTensorInputs",
21
+ "ModalityData",
22
+ "MultiModalDataBuiltins",
23
+ "MultiModalDataDict",
24
+ "MultiModalHashDict",
25
+ "MultiModalHasher",
26
+ "MultiModalKwargs",
27
+ "MultiModalPlaceholderDict",
28
+ "MultiModalPlaceholderMap",
29
+ "MultiModalPlugin",
30
+ "NestedTensors",
31
+ "MULTIMODAL_REGISTRY",
32
+ "MultiModalRegistry",
33
+ ]
.venv/lib/python3.11/site-packages/vllm/multimodal/hasher.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import pickle
4
+ from typing import TYPE_CHECKING, Iterable, Mapping, Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+ from blake3 import blake3
9
+ from PIL import Image
10
+
11
+ from vllm.logger import init_logger
12
+
13
+ if TYPE_CHECKING:
14
+ from vllm.inputs import TokensPrompt
15
+
16
+ logger = init_logger(__name__)
17
+
18
+ MultiModalHashDict = Mapping[str, list[str]]
19
+ """
20
+ A dictionary containing hashes for items in each modality.
21
+ """
22
+
23
+
24
class MultiModalHasher:
    """Computes deterministic content hashes for multi-modal inputs."""

    @classmethod
    def serialize_item(cls, obj: object) -> bytes:
        """Convert a single item to a byte string for hashing."""
        # Items with a direct byte representation.
        if isinstance(obj, str):
            return obj.encode("utf-8")
        if isinstance(obj, bytes):
            return obj
        if isinstance(obj, Image.Image):
            return obj.tobytes()

        # Items convertible to a NumPy array: hash the raw buffer.
        if isinstance(obj, torch.Tensor):
            obj = obj.numpy()
        if isinstance(obj, (int, float)):
            obj = np.array(obj)
        if isinstance(obj, np.ndarray):
            return obj.tobytes()

        logger.warning(
            "No serialization method found for %s. "
            "Falling back to pickle.", type(obj))

        return pickle.dumps(obj)

    @classmethod
    def item_to_bytes(
        cls,
        key: str,
        obj: object,
    ) -> Iterable[tuple[bytes, bytes]]:
        """Yield ``(key, value)`` byte pairs, recursing into containers."""
        if isinstance(obj, (list, tuple)):
            for idx, child in enumerate(obj):
                yield from cls.item_to_bytes(f"{key}.{idx}", child)
        elif isinstance(obj, dict):
            for child_key, child in obj.items():
                yield from cls.item_to_bytes(f"{key}.{child_key}", child)
        else:
            yield cls.serialize_item(key), cls.serialize_item(obj)

    @classmethod
    def hash_kwargs(cls, **kwargs: object) -> str:
        """Return a blake3 hex digest over all keyword arguments."""
        hasher = blake3()
        for name, value in kwargs.items():
            for key_bytes, value_bytes in cls.item_to_bytes(name, value):
                hasher.update(key_bytes)
                hasher.update(value_bytes)
        return hasher.hexdigest()

    @classmethod
    def hash_prompt_mm_data(
            cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]:
        """Hash multimodal data in the user input prompt if they exist."""
        if "multi_modal_data" not in prompt:
            return None

        mm_data = prompt["multi_modal_data"]
        if not mm_data:
            # mm_data can be None or an empty dict.
            return None

        # Normalize each modality's value to a list, then hash per item.
        return {
            modality: [
                cls.hash_kwargs(**{modality: item})
                for item in (items if isinstance(items, list) else [items])
            ]
            for modality, items in mm_data.items()
        }
.venv/lib/python3.11/site-packages/vllm/multimodal/image.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import base64
4
+ from functools import lru_cache
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Dict, Optional
8
+
9
+ import torch
10
+ from PIL import Image
11
+
12
+ from vllm.inputs.registry import InputContext
13
+ from vllm.logger import init_logger
14
+ from vllm.transformers_utils.processor import get_image_processor
15
+ from vllm.utils import is_list_of
16
+
17
+ from .base import MediaIO, MultiModalPlugin
18
+ from .inputs import ImageItem, ModalityData, MultiModalKwargs
19
+
20
+ if TYPE_CHECKING:
21
+ from vllm.config import ModelConfig
22
+
23
+ logger = init_logger(__name__)
24
+
25
+ cached_get_image_processor = lru_cache(get_image_processor)
26
+
27
+
28
class ImagePlugin(MultiModalPlugin):
    """Plugin for image data."""

    def get_data_key(self) -> str:
        return "image"

    def _get_hf_image_processor(
        self,
        model_config: "ModelConfig",
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # Normalize None to an empty kwargs dict before expanding.
        extra_kwargs = mm_processor_kwargs if mm_processor_kwargs else {}
        return cached_get_image_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **extra_kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: ModalityData[ImageItem],
        **mm_processor_kwargs,
    ) -> MultiModalKwargs:
        model_config = ctx.model_config

        # Tensor input is treated as precomputed image embeddings and
        # bypasses HF processing entirely.
        if isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor):
            return MultiModalKwargs({"image_embeds": data})

        # PIL image input goes through the HF image processor.
        if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
            image_processor = self._get_hf_image_processor(
                model_config,
                mm_processor_kwargs,
            )
            if image_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the image object")
            try:
                # NOTE: mm_processor_kwargs are applied only when the
                # processor is constructed, not forwarded to preprocess(),
                # since the two signatures may not match.
                batch_data = image_processor.preprocess(
                    data, return_tensors="pt").data
            except Exception:
                logger.error(
                    "Failed to process image (%s) with the default mapper. "
                    "This is most likely an edge-case with this model's image "
                    "processor in transformers (type: %s), and not vLLM.",
                    data,
                    type(image_processor).__name__)
                raise
            return MultiModalKwargs(batch_data)

        raise TypeError(f"Invalid image type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        # Fixed upper bound used when the model does not override it.
        return 3000
92
+
93
+
94
def rescale_image_size(image: Image.Image,
                       size_factor: float,
                       transpose: int = -1) -> Image.Image:
    """Rescale the dimensions of an image by a constant factor."""
    scaled = image.resize((int(image.width * size_factor),
                           int(image.height * size_factor)))
    # A non-negative value selects a PIL transpose operation to apply.
    if transpose >= 0:
        scaled = scaled.transpose(Image.Transpose(transpose))
    return scaled
104
+
105
+
106
class ImageMediaIO(MediaIO[Image.Image]):
    """Load and encode PIL images in a fixed color mode (default RGB)."""

    def __init__(self, *, image_mode: str = "RGB") -> None:
        super().__init__()
        self.image_mode = image_mode

    def _open_and_convert(self, source) -> Image.Image:
        # Force decoding now, before the underlying source goes away.
        img = Image.open(source)
        img.load()
        return img.convert(self.image_mode)

    def load_bytes(self, data: bytes) -> Image.Image:
        return self._open_and_convert(BytesIO(data))

    def load_base64(self, media_type: str, data: str) -> Image.Image:
        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> Image.Image:
        return self._open_and_convert(filepath)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str = "JPEG",
    ) -> str:
        """Encode an image into a base64 string in the given format."""
        with BytesIO() as buffer:
            media.convert(self.image_mode).save(buffer, image_format)
            payload = buffer.getvalue()
        return base64.b64encode(payload).decode('utf-8')
.venv/lib/python3.11/site-packages/vllm/multimodal/inputs.py ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections import UserDict, defaultdict
5
+ from collections.abc import Mapping, Sequence
6
+ from dataclasses import dataclass
7
+ from functools import partial
8
+ from itertools import accumulate
9
+ from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
10
+ Union, cast, final)
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.types
15
+ from PIL.Image import Image
16
+ from transformers import BatchFeature
17
+ from typing_extensions import NotRequired, TypeAlias
18
+
19
+ from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves
20
+
21
+ if TYPE_CHECKING:
22
+ from .hasher import MultiModalHashDict
23
+
24
+ _T = TypeVar("_T")
25
+
26
+ HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
27
+ """
28
+ A :class:`transformers.image_utils.ImageInput` representing a single image
29
+ item, which can be passed to a HuggingFace :code:`ImageProcessor`.
30
+ """
31
+
32
+ HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
33
+ list[np.ndarray], list[torch.Tensor]]
34
+ """
35
+ A :class:`transformers.image_utils.VideoInput` representing a single video
36
+ item, which can be passed to a HuggingFace :code:`VideoProcessor`.
37
+ """
38
+
39
+ HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
40
+ """
41
+ Represents a single audio
42
+ item, which can be passed to a HuggingFace :code:`AudioProcessor`.
43
+ """
44
+
45
+ ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
46
+ """
47
+ A :class:`transformers.image_utils.ImageInput` representing a single image
48
+ item, which can be passed to a HuggingFace :code:`ImageProcessor`.
49
+
50
+ Alternatively, a 3-D tensor or batch of 2-D tensors,
51
+ which are treated as image embeddings;
52
+ these are directly passed to the model without HF processing.
53
+ """
54
+
55
+ VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
56
+ """
57
+ A :class:`transformers.image_utils.VideoInput` representing a single video
58
+ item, which can be passed to a HuggingFace :code:`VideoProcessor`.
59
+
60
+ Alternatively, a 3-D tensor or batch of 2-D tensors,
61
+ which are treated as video embeddings;
62
+ these are directly passed to the model without HF processing.
63
+ """
64
+
65
+ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
66
+ torch.Tensor]
67
+ """
68
+ Represents a single audio
69
+ item, which can be passed to a HuggingFace :code:`AudioProcessor`.
70
+
71
+ Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
72
+ is different from that expected by the model;
73
+ these are resampled to the model's sampling rate before being processed by HF.
74
+
75
+ Alternatively, a 3-D tensor or batch of 2-D tensors,
76
+ which are treated as audio embeddings;
77
+ these are directly passed to the model without HF processing.
78
+ """
79
+
80
+ ModalityData: TypeAlias = Union[_T, list[_T]]
81
+ """
82
+ Either a single data item, or a list of data items.
83
+
84
+ The number of data items allowed per modality is restricted by
85
+ :code:`--limit-mm-per-prompt`.
86
+ """
87
+
88
+
89
+ @final
90
+ class MultiModalDataBuiltins(TypedDict, total=False):
91
+ """Type annotations for modality types predefined by vLLM."""
92
+
93
+ image: ModalityData[ImageItem]
94
+ """The input image(s)."""
95
+
96
+ video: ModalityData[VideoItem]
97
+ """The input video(s)."""
98
+
99
+ audio: ModalityData[AudioItem]
100
+ """The input audio(s)."""
101
+
102
+
103
+ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
104
+ """
105
+ A dictionary containing an entry for each modality type to input.
106
+
107
+ The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
108
+ """
109
+
110
+
111
class PlaceholderRange(TypedDict):
    """
    Placeholder location information for multi-modal data.

    Example:

        Prompt: :code:`AAAA BBBB What is in these images?`

        Images A and B will have:

        .. code-block::

            A: { "offset": 0, "length": 4 }
            B: { "offset": 5, "length": 4 }
    """

    # Start index of the placeholder in the prompt.
    offset: int

    # Number of positions the placeholder occupies.
    length: int
132
+
133
+
134
NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor,
                      tuple[torch.Tensor, ...]]
"""
Uses a list instead of a tensor if the dimensions of each element do not match.
"""


def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
    """Equality check between :data:`NestedTensors` objects."""
    # Tensor leaves: equal only if both are tensors with identical
    # shape and values.
    if isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor):
        return (isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor)
                and torch.equal(a, b))

    # Nested lists: compare elementwise (zip semantics, i.e. only up to
    # the shorter length — matches the original behavior).
    if isinstance(a, list) or isinstance(b, list):
        return (isinstance(a, list) and isinstance(b, list)
                and all(
                    nested_tensors_equal(x, y) for x, y in zip(a, b)))

    # Both a and b are scalars
    return a == b
157
+
158
+
159
+ BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
160
+ """
161
+ A dictionary containing nested tensors which have been batched via
162
+ :meth:`MultiModalKwargs.batch`.
163
+ """
164
+
165
+
166
@dataclass(frozen=True)
class MultiModalFieldElem:
    """
    Represents a keyword argument corresponding to a multi-modal item
    in :class:`MultiModalKwargs`.
    """

    # Modality of the corresponding multi-modal item; one item may
    # consist of multiple keyword arguments.
    modality: str

    # Keyword-argument name under which this field is passed to the model.
    key: str

    # Tensor payload passed to the model under ``key``.
    data: NestedTensors

    # Strategy describing how to batch this field with its siblings.
    field: "BaseMultiModalField"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, self.__class__):
            return False
        same_identity = (self.modality, self.key) == (other.modality,
                                                      other.key)
        return (same_identity
                and nested_tensors_equal(self.data, other.data)
                and type(self.field) == type(other.field))  # noqa: E721
204
+
205
+
206
@dataclass(frozen=True)
class BaseMultiModalField(ABC):
    """
    Defines how to interpret tensor data belonging to a keyword argument in
    :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
    """

    def _field_factory(self, *, modality: str, key: str):
        # Pre-bind everything except the tensor payload itself.
        make_elem = partial(
            MultiModalFieldElem,
            modality=modality,
            key=key,
            field=self,
        )

        def factory(data: NestedTensors) -> MultiModalFieldElem:
            # Convenience wrapper so the payload can be passed positionally.
            return make_elem(data=data)

        return factory

    @abstractmethod
    def build_elems(
        self,
        modality: str,
        key: str,
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        """
        Construct :class:`MultiModalFieldElem` instances to represent
        the provided data.

        This is the inverse of :meth:`reduce_data`.
        """
        raise NotImplementedError

    @abstractmethod
    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
        raise NotImplementedError

    def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
        """
        Merge the data from multiple instances of :class:`MultiModalFieldElem`.

        This is the inverse of :meth:`build_elems`.
        """
        # All elements must share the same field strategy to be mergeable.
        field_types = [type(item.field) for item in elems]
        if len(set(field_types)) > 1:
            raise ValueError(f"Cannot merge different {field_types=}")
        return self._reduce_data([item.data for item in elems])
257
+
258
+
259
@dataclass(frozen=True)
class MultiModalBatchedField(BaseMultiModalField):
    """
    See also:
        :func:`MultiModalFieldConfig.batched`
    """

    def build_elems(
        self,
        modality: str,
        key: str,
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        make_elem = self._field_factory(modality=modality, key=key)
        # One element per entry along the leading dimension.
        return [make_elem(item) for item in data]

    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
        if batch and is_list_of(batch, torch.Tensor, check="all"):
            if len(batch) == 1:
                # Single-tensor fast path: identical result to
                # torch.stack(batch), zero-copy when contiguous.
                return batch[0].unsqueeze(0).contiguous()
            ref_shape = batch[0].shape
            if all(t.shape == ref_shape for t in batch):
                return torch.stack(batch)
        # Ragged shapes: keep as a plain list.
        return batch
287
+
288
+
289
@dataclass(frozen=True)
class MultiModalFlatField(BaseMultiModalField):
    """
    See also:
        :func:`MultiModalFieldConfig.flat`
        :func:`MultiModalFieldConfig.flat_from_sizes`
    """
    slices: Sequence[slice]

    def build_elems(
        self,
        modality: str,
        key: str,
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        make_elem = self._field_factory(modality=modality, key=key)
        # One element per configured slice of the flat data.
        return [make_elem(data[sl]) for sl in self.slices]

    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
        if batch and is_list_of(batch, torch.Tensor, check="all"):
            if len(batch) == 1:
                # Single-tensor fast path: identical result to
                # torch.concat(batch), zero-copy when contiguous.
                return batch[0].contiguous()
            ref_shape = batch[0].shape
            if all(t.shape[1:] == ref_shape[1:] for t in batch):
                return torch.concat(batch)
        # Mismatched trailing dims: flatten one nesting level instead.
        return [item for chunk in batch for item in chunk]
319
+
320
+
321
@dataclass(frozen=True)
class MultiModalSharedField(BaseMultiModalField):
    """
    A field whose data is shared (identical) across all items in the batch.

    See also:
        :func:`MultiModalFieldConfig.shared`
    """
    batch_size: int

    def build_elems(
        self,
        modality: str,
        key: str,
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        field_factory = self._field_factory(modality=modality, key=key)
        # Every element references the same underlying data object.
        return [field_factory(data)] * self.batch_size

    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
        # All entries are the same object, so any one of them suffices.
        return batch[0]
340
+
341
+
342
class MultiModalFieldConfig:
    """
    Pairs a :class:`BaseMultiModalField` with the modality it applies to.

    Use the static factory methods (:meth:`batched`, :meth:`flat`,
    :meth:`flat_from_sizes`, :meth:`shared`) to construct instances.
    """

    @staticmethod
    def batched(modality: str):
        """
        Defines a field where an element in the batch is obtained by
        indexing into the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.

        Example:

            .. code-block::

                Input:
                    Data: [[AAAA]
                        [BBBB]
                        [CCCC]]

                Output:
                    Element 1: [AAAA]
                    Element 2: [BBBB]
                    Element 3: [CCCC]
        """
        return MultiModalFieldConfig(
            field=MultiModalBatchedField(),
            modality=modality,
        )

    @staticmethod
    def flat(modality: str, slices: Sequence[slice]):
        """
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            slices: For each multi-modal item, a slice that is used to extract
                the data corresponding to it.

        Example:

            .. code-block::

                Given:
                    slices: [slice(0, 3), slice(3, 7), slice(7, 9)]

                Input:
                    Data: [AAABBBBCC]

                Output:
                    Element 1: [AAA]
                    Element 2: [BBBB]
                    Element 3: [CC]
        """
        return MultiModalFieldConfig(
            field=MultiModalFlatField(slices=slices),
            modality=modality,
        )

    @staticmethod
    def flat_from_sizes(modality: str, size_per_item: torch.Tensor):
        """
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            size_per_item: For each multi-modal item, the size of the slice
                that is used to extract the data corresponding to it.

        Example:

            .. code-block::

                Given:
                    size_per_item: [3, 4, 2]

                Input:
                    Data: [AAABBBBCC]

                Output:
                    Element 1: [AAA]
                    Element 2: [BBBB]
                    Element 3: [CC]

        See also:
            :func:`MultiModalFieldConfig.flat`
        """

        # Convert cumulative sizes into contiguous [start, end) slices.
        slice_idxs = [0, *accumulate(size_per_item)]
        slices = [
            slice(slice_idxs[i], slice_idxs[i + 1])
            for i in range(len(size_per_item))
        ]

        return MultiModalFieldConfig.flat(modality, slices)

    @staticmethod
    def shared(modality: str, batch_size: int):
        """
        Defines a field where an element in the batch is obtained by
        taking the entirety of the underlying data.

        This means that the data is the same for each element in the batch.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            batch_size: The number of multi-modal items which share this data.

        Example:

            .. code-block::

                Given:
                    batch_size: 4

                Input:
                    Data: [XYZ]

                Output:
                    Element 1: [XYZ]
                    Element 2: [XYZ]
                    Element 3: [XYZ]
                    Element 4: [XYZ]
        """
        return MultiModalFieldConfig(
            field=MultiModalSharedField(batch_size),
            modality=modality,
        )

    def __init__(self, field: BaseMultiModalField, modality: str) -> None:
        super().__init__()

        self.field = field
        self.modality = modality

    def build_elems(
        self,
        key: str,
        batch: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        """Build one :class:`MultiModalFieldElem` per item under ``key``."""
        return self.field.build_elems(self.modality, key, batch)
490
+
491
+
492
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
    """
    A collection of :class:`MultiModalFieldElem`
    corresponding to a data item in :class:`MultiModalDataItems`.
    """

    @staticmethod
    def from_elems(elems: Sequence[MultiModalFieldElem]):
        """Build an item keyed by each element's keyword-argument name."""
        return MultiModalKwargsItem({elem.key: elem for elem in elems})

    @property
    def modality(self) -> str:
        """The single modality shared by every element in this item."""
        modalities = {elem.modality for elem in self.data.values()}
        assert len(modalities) == 1, f"Found different modalities={modalities}"
        return next(iter(modalities))
507
+
508
+
509
# NOTE: UserDict is for V0 compatibility.
# V1 should access individual items via `get_item`.
class MultiModalKwargs(UserDict[str, NestedTensors]):
    """
    A dictionary that represents the keyword arguments to
    :meth:`~torch.nn.Module.forward`.

    The metadata :code:`items` enables us to obtain the keyword arguments
    corresponding to each data item in :class:`MultiModalDataItems`, via
    :meth:`get_item` and :meth:`get_items`.
    """

    @staticmethod
    def from_hf_inputs(
        hf_inputs: BatchFeature,
        config_by_key: Mapping[str, MultiModalFieldConfig],
    ):
        """
        Build from HF processor outputs using per-key field configs.

        Raises:
            ValueError: If keys of the same modality disagree on batch size.
        """
        # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
        # We assume that those fields are not used in vLLM
        elems_by_key = dict[str, Sequence[MultiModalFieldElem]]()
        keys_by_modality = defaultdict[str, set[str]](set)
        for key, config in config_by_key.items():
            batch = hf_inputs.get(key)
            if batch is not None:
                elems = config.build_elems(key, batch)
                if len(elems) > 0:
                    elems_by_key[key] = elems
                    keys_by_modality[config.modality].add(key)

        items = list[MultiModalKwargsItem]()
        for modality, keys in keys_by_modality.items():
            elems_in_modality = {k: elems_by_key[k] for k in keys}
            batch_sizes = {k: len(v) for k, v in elems_in_modality.items()}

            if len(set(batch_sizes.values())) > 1:
                raise ValueError(
                    f"Cannot merge different batch sizes for {modality=}! "
                    f"Found: {batch_sizes=}")

            batch_size = next(iter(batch_sizes.values()))
            for item_idx in range(batch_size):
                elems = [v[item_idx] for v in elems_in_modality.values()]
                items.append(MultiModalKwargsItem.from_elems(elems))

        return MultiModalKwargs.from_items(items)

    @staticmethod
    def from_items(items: Sequence[MultiModalKwargsItem]):
        """Construct a new :class:`MultiModalKwargs` from multiple items."""
        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
        for item in items:
            for key, elem in item.items():
                elems_by_key[key].append(elem)

        data = {
            key: elems[0].field.reduce_data(elems)
            for key, elems in elems_by_key.items() if len(elems) > 0
        }

        return MultiModalKwargs(data, items=items)

    def __init__(
        self,
        data: Mapping[str, NestedTensors],
        *,
        items: Optional[Sequence[MultiModalKwargsItem]] = None,
    ) -> None:
        super().__init__(data)

        # Group items by modality so per-modality lookups are O(1).
        items_by_modality = full_groupby(items or [], key=lambda x: x.modality)
        self._items_by_modality = dict(items_by_modality)

    @property
    def modalities(self):
        return self._items_by_modality.keys()

    @staticmethod
    def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
        """
        Stack the inner dimensions that have the same shape in
        a nested list of tensors.

        Thus, a dimension represented by a list means that the inner
        dimensions are different for each element along that dimension.
        """
        if isinstance(nested_tensors, torch.Tensor):
            return nested_tensors

        # TODO: Remove these once all models have been migrated
        if isinstance(nested_tensors, np.ndarray):
            return torch.from_numpy(nested_tensors)
        if isinstance(nested_tensors, (int, float)):
            return torch.tensor(nested_tensors)

        stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors]
        if not is_list_of(stacked, torch.Tensor, check="all"):
            # Only tensors (not lists) can be stacked.
            return stacked

        tensors_ = cast(list[torch.Tensor], stacked)
        if len(tensors_) == 1:
            # An optimization when `tensors_` contains only one tensor:
            # - produce exactly same result as `torch.stack(tensors_)`
            # - will achieve zero-copy if the tensor is contiguous
            return tensors_[0].unsqueeze(0).contiguous()

        if any(t.shape != tensors_[0].shape for t in tensors_):
            # The tensors have incompatible shapes and can't be stacked.
            return tensors_

        return torch.stack(tensors_)

    @staticmethod
    def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs:
        """
        Batch multiple inputs together into a dictionary.

        The resulting dictionary has the same keys as the inputs.
        If the corresponding value from each input is a tensor and they all
        share the same shape, the output value is a single batched tensor;
        otherwise, the output value is a list containing the original value
        from each input.
        """
        if len(inputs_list) == 0:
            return {}

        # We need to consider the case where each item in the batch
        # contains different modalities (i.e. different keys).
        item_lists = defaultdict[str, list[NestedTensors]](list)

        for inputs in inputs_list:
            for k, v in inputs.items():
                item_lists[k].append(v)

        return {
            k: MultiModalKwargs._try_stack(item_list)
            for k, item_list in item_lists.items()
        }

    @staticmethod
    def as_kwargs(
        batched_inputs: BatchedTensorInputs,
        *,
        device: torch.types.Device,
    ) -> BatchedTensorInputs:
        """Move every tensor leaf of `batched_inputs` onto `device`."""
        json_inputs = cast(JSONTree[torch.Tensor], batched_inputs)

        json_mapped = json_map_leaves(
            lambda x: x.to(device, non_blocking=True),
            json_inputs,
        )

        return cast(BatchedTensorInputs, json_mapped)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, self.__class__):
            return False
        if self._items_by_modality != other._items_by_modality:
            return False

        ks = self.keys()
        return (ks == other.keys()
                and all(nested_tensors_equal(self[k], other[k]) for k in ks))

    def _validate_modality(self, method_name: str, modality: str) -> None:
        # `items` must have been supplied at construction for item access.
        if not self._items_by_modality:
            raise RuntimeError(
                f"`{method_name}` is not supported when "
                "MultiModalKwargs is not initialized with `items`")

        if modality not in self._items_by_modality:
            available_modalities = set(self._items_by_modality.keys())
            raise KeyError(f"Modality {modality!r} not found. "
                           f"Available modalities: {available_modalities}")

    def get_item_count(self, modality: str) -> int:
        """Get the number of items belonging to a modality."""
        self._validate_modality("get_item_count", modality)
        return len(self._items_by_modality[modality])

    def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem:
        """
        Get the keyword arguments corresponding to an item identified by
        its modality and index.
        """
        self._validate_modality("get_item", modality)
        return self._items_by_modality[modality][item_index]

    def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
        """
        Get the keyword arguments corresponding to each item belonging to
        a modality.
        """
        self._validate_modality("get_items", modality)
        return self._items_by_modality[modality]
704
+
705
+
706
MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
"""
A dictionary containing placeholder ranges for each modality.
"""
710
+
711
+
712
class MultiModalInputs(TypedDict):
    """
    Represents the outputs of
    :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
    ready to be passed to vLLM internals.
    """

    type: Literal["multimodal"]
    """The type of inputs."""

    prompt: str
    """The processed prompt text."""

    prompt_token_ids: list[int]
    """The processed token IDs which includes placeholder tokens."""

    token_type_ids: NotRequired[list[int]]
    """The token type IDs of the prompt."""

    mm_kwargs: MultiModalKwargs
    """Keyword arguments to be directly passed to the model after batching."""

    mm_hashes: NotRequired[Optional["MultiModalHashDict"]]
    """The hashes of the multi-modal data."""

    mm_placeholders: MultiModalPlaceholderDict
    """
    For each modality, information about the placeholder tokens in
    :code:`prompt_token_ids`.
    """
.venv/lib/python3.11/site-packages/vllm/multimodal/parse.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections import UserDict
5
+ from collections.abc import Callable, Iterator, Mapping, Sequence
6
+ from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar,
7
+ Union)
8
+
9
+ import numpy as np
10
+ import torch
11
+ from PIL.Image import Image
12
+ from typing_extensions import TypeAlias, TypeGuard, assert_never
13
+
14
+ from vllm.utils import is_list_of
15
+
16
+ from .audio import resample_audio
17
+ from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
18
+ ImageItem, ModalityData, MultiModalDataDict, VideoItem)
19
+
20
_T = TypeVar("_T")
_I = TypeVar("_I")


class ModalityDataItems(ABC, Generic[_T, _I]):
    """
    Represents data items for a modality in :class:`MultiModalDataItems`.

    Type parameters:
        ``_T``: the container type holding the raw data.
        ``_I``: the type of an individual item.
    """

    def __init__(self, data: _T, modality: str) -> None:
        super().__init__()

        self.data = data
        self.modality = modality

    def __repr__(self) -> str:
        return (f"{type(self).__name__}(modality={self.modality!r}, "
                f"len={len(self)})")

    def __len__(self) -> int:
        return self.get_count()

    def __getitem__(self, index: int) -> _I:
        return self.get(index)

    if TYPE_CHECKING:
        # Auto-generated
        def __iter__(self) -> Iterator[_I]:
            ...

    @abstractmethod
    def get_count(self) -> int:
        """Get the number of data items."""
        raise NotImplementedError

    @abstractmethod
    def get(self, index: int) -> _I:
        """Get a data item by its index."""
        raise NotImplementedError

    def get_all(self) -> list[_I]:
        """Get all data items."""
        return [self.get(idx) for idx in range(self.get_count())]

    @abstractmethod
    def get_processor_data(self) -> Mapping[str, object]:
        """Get the data to pass to the HF processor."""
        raise NotImplementedError

    @abstractmethod
    def get_passthrough_data(self) -> Mapping[str, object]:
        """Get the data to pass directly to the model."""
        raise NotImplementedError
73
+
74
+
75
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
    """Base class for data items that are arranged in a list."""

    def get_count(self) -> int:
        return len(self.data)

    def get(self, index: int) -> _T:
        return self.data[index]

    def get_processor_data(self) -> Mapping[str, object]:
        # HF processors expect the pluralized modality name as the key.
        return {f"{self.modality}s": self.data}

    def get_passthrough_data(self) -> Mapping[str, object]:
        return {}
89
+
90
+
91
class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
                                       torch.Tensor]):
    """
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    """

    def get_count(self) -> int:
        # Works for both cases: first dim of a batched tensor,
        # or the length of a list of per-item tensors.
        return len(self.data)

    def get(self, index: int) -> torch.Tensor:
        return self.data[index]

    def get_processor_data(self) -> Mapping[str, object]:
        # Embeddings bypass the HF processor entirely.
        return {}

    def get_passthrough_data(self) -> Mapping[str, object]:
        return {f"{self.modality}_embeds": self.data}

    def get_feature_size(self, item_idx: int) -> int:
        """Number of feature positions occupied by the item's embedding."""
        return len(self.get(item_idx))
112
+
113
+
114
class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
    """Audio items to be fed through the HF processor."""

    def __init__(self, data: Sequence[HfAudioItem]) -> None:
        super().__init__(data, "audio")
118
+
119
+
120
class AudioEmbeddingItems(EmbeddingItems):
    """Precomputed audio embeddings passed directly to the model."""

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "audio")
124
+
125
+
126
class ImageSize(NamedTuple):
    """Image dimensions as ``(width, height)``, matching PIL's convention."""
    width: int
    height: int
129
+
130
+
131
class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
    """Image items to be fed through the HF processor."""

    def __init__(self, data: Sequence[HfImageItem]) -> None:
        super().__init__(data, "image")

    def get_image_size(self, item_idx: int) -> ImageSize:
        """Return the ``(width, height)`` of the image at ``item_idx``."""
        image = self.get(item_idx)

        if isinstance(image, Image):
            return ImageSize(*image.size)
        if isinstance(image, (np.ndarray, torch.Tensor)):
            # Arrays/tensors are channels-first: (C, H, W).
            _, h, w = image.shape
            return ImageSize(w, h)

        assert_never(image)
146
+
147
+
148
class ImageEmbeddingItems(EmbeddingItems):
    """Precomputed image embeddings passed directly to the model."""

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "image")
152
+
153
+
154
class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
    """Video items to be fed through the HF processor."""

    def __init__(self, data: Sequence[HfVideoItem]) -> None:
        super().__init__(data, "video")

    def get_num_frames(self, item_idx: int) -> int:
        """Return the number of frames in the video at ``item_idx``."""
        return len(self.get(item_idx))

    def get_frame_size(self, item_idx: int) -> ImageSize:
        """Return the ``(width, height)`` of the video's frames."""
        image = self.get(item_idx)[0]  # Assume that the video isn't empty

        if isinstance(image, Image):
            return ImageSize(*image.size)
        if isinstance(image, (np.ndarray, torch.Tensor)):
            # Frames are channels-first: (C, H, W).
            _, h, w = image.shape
            return ImageSize(w, h)

        assert_never(image)
172
+
173
+
174
class VideoEmbeddingItems(EmbeddingItems):
    """Precomputed video embeddings passed directly to the model."""

    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
        super().__init__(data, "video")
178
+
179
+
180
_D = TypeVar("_D", bound=ModalityDataItems[Any, Any])


class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
    """
    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
    such that each entry corresponds to a list.
    """

    def get_count(self, modality: str, *, strict: bool = True) -> int:
        """
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising :exc:`KeyError`
        even if the modality is not found.
        """
        if modality not in self:
            if strict:
                available_modalities = set(self.keys())
                raise KeyError(f"Modality {modality!r} not found. "
                               f"Available modalities: {available_modalities}")

            return 0

        return self[modality].get_count()

    def get_all_counts(self) -> Mapping[str, int]:
        """Get the number of items belonging to each modality."""
        return {m: items.get_count() for m, items in self.items()}

    def get_items(
        self,
        modality: str,
        typ: Union[type[_D], tuple[type[_D], ...]],
    ) -> _D:
        """
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.

        Raises:
            KeyError: If the modality is not present.
            TypeError: If the items are not an instance of ``typ``.
        """
        if modality not in self:
            available_modalities = set(self.keys())
            raise KeyError(f"Modality {modality!r} not found. "
                           f"Available modalities: {available_modalities}")

        items = self[modality]
        if not isinstance(items, typ):
            raise TypeError(f"Invalid type of data items for {modality=}. "
                            f"Expected type: {typ}, but "
                            f"found type: {type(items)}")

        return items  # type: ignore[return-value]
231
+
232
+
233
# Callable that converts raw per-modality data into normalized items.
ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
                                         ModalityDataItems[Any, Any]]
235
+
236
+
237
class MultiModalDataParser:
    """
    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
    :class:`MultiModalDataItems`.

    Args:
        target_sr (float, optional): Enables automatic resampling of audio
            items to the model's expected sampling rate.
    """

    def __init__(self, *, target_sr: Optional[float] = None) -> None:
        super().__init__()

        self.target_sr = target_sr

    def _is_embeddings(
        self, data: object
    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
        # A batched embedding tensor is 3-D; a list holds 2-D tensors.
        if isinstance(data, torch.Tensor):
            return data.ndim == 3
        if is_list_of(data, torch.Tensor):
            return len(data) == 0 or data[0].ndim == 2

        return False

    def _get_audio_with_sr(
        self,
        audio: AudioItem,
    ) -> tuple[np.ndarray, Optional[float]]:
        """Normalize an audio item to ``(waveform, sampling_rate_or_None)``."""
        if isinstance(audio, tuple):
            return audio
        if isinstance(audio, list):
            return np.array(audio), None
        if isinstance(audio, np.ndarray):
            return audio, None
        if isinstance(audio, torch.Tensor):
            return audio.numpy(), None

        assert_never(audio)

    def _parse_audio_data(
        self,
        data: ModalityData[AudioItem],
    ) -> ModalityDataItems[Any, Any]:
        if self._is_embeddings(data):
            return AudioEmbeddingItems(data)

        # A single item: 1-D waveform, list of floats, or (waveform, sr) tuple.
        if (is_list_of(data, float)
                or isinstance(data,
                              (np.ndarray, torch.Tensor)) and data.ndim == 1
                or isinstance(data, tuple)):
            data_items = [data]
        elif isinstance(data, (np.ndarray, torch.Tensor)):
            data_items = [elem for elem in data]
        else:
            data_items = data

        new_audios = list[np.ndarray]()
        for data_item in data_items:
            audio, orig_sr = self._get_audio_with_sr(data_item)
            if orig_sr is None:
                new_audio = audio
            else:
                target_sr = self.target_sr
                if target_sr is None:
                    raise RuntimeError(
                        "Audio resampling is not supported when "
                        "`target_sr` is not provided")

                new_audio = resample_audio(audio,
                                           orig_sr=orig_sr,
                                           target_sr=target_sr)

            new_audios.append(new_audio)

        return AudioProcessorItems(new_audios)

    def _parse_image_data(
        self,
        data: ModalityData[ImageItem],
    ) -> ModalityDataItems[Any, Any]:
        if self._is_embeddings(data):
            return ImageEmbeddingItems(data)

        # A single item: PIL image or a 3-D (C, H, W) array/tensor.
        if (isinstance(data, Image)
                or isinstance(data,
                              (np.ndarray, torch.Tensor)) and data.ndim == 3):
            data_items = [data]
        elif isinstance(data, (np.ndarray, torch.Tensor)):
            data_items = [elem for elem in data]
        else:
            data_items = data

        return ImageProcessorItems(data_items)

    def _parse_video_data(
        self,
        data: ModalityData[VideoItem],
    ) -> ModalityDataItems[Any, Any]:
        if self._is_embeddings(data):
            return VideoEmbeddingItems(data)

        # A single item: list of frames or a 4-D (T, C, H, W) array/tensor.
        if (is_list_of(data, Image)
                or isinstance(data,
                              (np.ndarray, torch.Tensor)) and data.ndim == 4):
            data_items = [data]
        elif isinstance(data, (np.ndarray, torch.Tensor)):
            data_items = [elem for elem in data]
        else:
            data_items = data

        return VideoProcessorItems(data_items)

    def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
        """Map each supported modality name to its parsing function."""
        return {
            "audio": self._parse_audio_data,
            "image": self._parse_image_data,
            "video": self._parse_video_data,
        }

    def parse_mm_data(self,
                      mm_data: MultiModalDataDict) -> MultiModalDataItems:
        """
        Parse raw multi-modal data into normalized items.

        Raises:
            ValueError: If an unsupported modality key is encountered.
        """
        subparsers = self._get_subparsers()

        mm_items = MultiModalDataItems()
        for k, v in mm_data.items():
            if k not in subparsers:
                raise ValueError(f"Unsupported modality: {k}")

            mm_items[k] = subparsers[k](v)

        return mm_items
.venv/lib/python3.11/site-packages/vllm/multimodal/processing.py ADDED
@@ -0,0 +1,1295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import re
4
+ from abc import ABC, abstractmethod
5
+ from collections import defaultdict
6
+ from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping,
7
+ Sequence)
8
+ from dataclasses import dataclass, field
9
+ from functools import lru_cache
10
+ from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
11
+ TypeVar, Union)
12
+
13
+ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
14
+
15
+ import vllm.envs as envs
16
+ from vllm.inputs import InputProcessingContext
17
+ from vllm.logger import init_logger
18
+ from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
19
+ encode_tokens)
20
+ from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
21
+
22
+ from .hasher import MultiModalHasher
23
+ from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
24
+ MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem,
25
+ PlaceholderRange)
26
+ from .parse import MultiModalDataItems, MultiModalDataParser
27
+
28
+ if TYPE_CHECKING:
29
+ from .profiling import BaseDummyInputsBuilder
30
+
31
+ logger = init_logger(__name__)
32
+
33
+ _S = TypeVar("_S", str, list[int])
34
+
35
+ PromptSeq = Union[str, list[int]]
36
+ """A token sequence (list of token IDs) or text."""
37
+
38
+
39
@dataclass
class PromptReplacementDetails:
    """Details about the replacement token sequence or text."""

    full: PromptSeq
    """The full replacement."""

    features: PromptSeq
    """
    The part of the replacement that corresponds to feature placeholders;
    this will be replaced by the output of the vision encoder during model
    inference.
    """

    @staticmethod
    def from_seq(seq: PromptSeq) -> "PromptReplacementDetails":
        """Build details where the whole sequence is the feature part."""
        return PromptReplacementDetails(full=seq, features=seq)
56
+
57
+
58
PromptRepl = Union[PromptSeq, PromptReplacementDetails]
"""
The replacement token sequence or text.

If only part of the replacement corresponds to feature placeholders, you can
use :class:`PromptReplacementDetails` to specify which part.
"""
65
+
66
+
67
@dataclass
class PromptReplacement:
    """
    Defines how to replace portions of an input prompt with placeholder tokens.

    Example:

        For each image, replace one ``<image>`` input placeholder in the prompt
        with a number of ``<image>`` feature placeholders
        equal to the feature size of the vision encoder:

        .. code-block:: python

            PromptReplacement(
                modality="image",
                target="<image>",
                replacement="<image>" * image_feature_size,
            )

        As above, but further pad the feature placeholders with ``<image_bos>``
        and ``<image_eos>``, which are not supposed to be passed to the vision
        encoder:

        .. code-block:: python

            PromptReplacement(
                modality="image",
                target="<image>",
                replacement=PromptReplacementDetails(
                    full="".join([
                        "<image_bos>",
                        "<image>" * image_feature_size,
                        "<image_eos>",
                    ]),
                    features="<image>" * image_feature_size,
                ),
            )

        To avoid unnecessary tokenization during prompt replacement,
        we recommended passing token sequences instead of text:

        .. code-block:: python

            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=PromptReplacementDetails(
                    full=([image_bos_id] + [image_token_id] * image_feature_size
                          + [image_eos_id]),
                    features=[image_token_id] * image_feature_size,
                ),
            )
    """

    modality: str
    """The modality for which the replacement is made."""

    target: PromptSeq
    """The token sequence (or text) to find and replace."""

    replacement: Union[Callable[[int], PromptRepl],
                       PromptRepl] = field(repr=False)
    """
    Given the index of the processed item within :attr:`modality`,
    output the replacement token sequence (or text).

    For convenience, you can directly pass in the replacement token sequence
    (or text) instead of a function if it does not depend on the input.
    """

    def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
        """Attach a tokenizer so text/token conversions can be done lazily."""
        return BoundPromptReplacement(
            tokenizer=tokenizer,
            modality=self.modality,
            _target=self.target,
            _replacement=self.replacement,
        )
144
+
145
+
146
@lru_cache(maxsize=2048)
def _cached_encode(
    tokenizer: AnyTokenizer,
    text: str,
    *,
    add_special_tokens: bool = False,
) -> list[int]:
    # Memoized wrapper around `encode_tokens`. The cache is keyed on the
    # tokenizer instance itself, so entries keep that tokenizer object alive
    # for the lifetime of the cache.
    return encode_tokens(tokenizer,
                         text,
                         add_special_tokens=add_special_tokens)
156
+
157
+
158
@lru_cache(maxsize=2048)
def _cached_decode(
    tokenizer: AnyTokenizer,
    token_ids: tuple[int, ...],
    *,
    skip_special_tokens: bool = False,
) -> str:
    # Memoized wrapper around `decode_tokens`. Token IDs are received as a
    # tuple (hashable) and converted back to the list that the tokenizer
    # API expects.
    return decode_tokens(tokenizer,
                         list(token_ids),
                         skip_special_tokens=skip_special_tokens)
168
+
169
+
170
class _HasModalityAttr(Protocol):
    # Structural type: any object exposing a ``modality`` attribute.
    modality: str


class _HasModalityProp(Protocol):
    # Structural type: any object exposing a read-only ``modality`` property.

    @property
    def modality(self) -> str:
        ...


# Anything carrying a `modality`, whether as an attribute or a property.
_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
182
+
183
+
184
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
    """Convenience function to apply :func:`full_groupby` based on modality."""

    def _modality_key(value: _M) -> str:
        return value.modality

    return full_groupby(values, key=_modality_key)
187
+
188
+
189
@dataclass
class _BoundPromptSequence:
    """
    A :data:`_PromptSeq` bound to a tokenizer to automatically
    convert between token sequence and text representations.

    Whichever representation was not supplied at construction time is
    derived lazily (and cached) on first access.
    """
    tokenizer: AnyTokenizer = field(repr=False)

    _text: Optional[str]
    _token_ids: Optional[list[int]]

    @staticmethod
    def from_seq(
        tokenizer: AnyTokenizer,
        seq: PromptSeq,
    ) -> "_BoundPromptSequence":
        # A plain string populates `_text`; a token-ID list populates
        # `_token_ids`. Anything else leaves both unset and is rejected
        # by `__post_init__`.
        text = seq if isinstance(seq, str) else None
        token_ids = seq if isinstance(seq, list) else None

        return _BoundPromptSequence(
            tokenizer=tokenizer,
            _text=text,
            _token_ids=token_ids,
        )

    def __post_init__(self) -> None:
        if self._text is None and self._token_ids is None:
            raise ValueError("At least one of 'text' and 'token_ids' must be "
                             "specified")

    @property
    def text(self) -> str:
        """Text form, decoded (and cached) from the token IDs on first use."""
        text = self._text
        if text is None:
            assert self._token_ids is not None
            text = _cached_decode(self.tokenizer, tuple(self._token_ids))
            self._text = text

        return text

    @property
    def token_ids(self) -> list[int]:
        """Token-ID form, encoded (and cached) from the text on first use."""
        token_ids = self._token_ids
        if token_ids is None:
            assert self._text is not None
            token_ids = _cached_encode(self.tokenizer, self._text)
            self._token_ids = token_ids

        return token_ids
231
+
232
+
233
@dataclass
class _BoundPromptReplacementGroup:
    # The full replacement sequence, including any surrounding tokens that
    # are not fed to the multi-modal encoder.
    full: _BoundPromptSequence
    # The sub-sequence of `full` that feature placeholders occupy.
    features: _BoundPromptSequence
237
+
238
+
239
@dataclass
class BoundPromptReplacement:
    """
    A :class:`PromptReplacement` bound to a tokenizer to automatically
    convert :attr:`target` and the result of :meth:`get_replacement` between
    token sequence and text representations.
    """
    tokenizer: AnyTokenizer = field(repr=False)
    modality: str

    _target: PromptSeq
    _replacement: Union[Callable[[int], PromptRepl],
                        PromptRepl] = field(repr=False)

    def __post_init__(self) -> None:
        # Memoizes bound replacements per item index. Only populated when
        # `_replacement` is a callable; static replacements are cheap to
        # rebuild on every call.
        self._replacement_cache = dict[int, _BoundPromptReplacementGroup]()

    @property
    def target(self) -> _BoundPromptSequence:
        """The token sequence (or text) to find and replace."""
        return _BoundPromptSequence.from_seq(self.tokenizer, self._target)

    def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup:
        """
        Given the index of the processed item within :attr:`modality`,
        output the replacement token sequence (or text).
        """
        repl = self._replacement

        if callable(repl):
            cached = self._replacement_cache.get(item_idx)
            if cached is not None:
                return cached

            repl = repl(item_idx)
            should_cache = True
        else:
            should_cache = False

        if not isinstance(repl, PromptReplacementDetails):
            repl = PromptReplacementDetails.from_seq(repl)

        bound_repl = _BoundPromptReplacementGroup(
            full=_BoundPromptSequence.from_seq(self.tokenizer, repl.full),
            features=_BoundPromptSequence.from_seq(self.tokenizer,
                                                   repl.features),
        )

        if should_cache:
            self._replacement_cache[item_idx] = bound_repl

        return bound_repl
292
+
293
+
294
class _TokenMatch(NamedTuple):
    # Half-open span [start_idx, end_idx) of one match within the prompt.
    start_idx: int
    end_idx: int


def iter_token_matches(
    token_ids: list[int],
    match_ids: list[int],
) -> Generator[_TokenMatch]:
    """
    Yield each occurrence of :code:`match_ids` in :code:`token_ids`.

    Matches are found left to right and never overlap.
    Note that empty matches are ignored.
    """
    pattern_len = len(match_ids)
    if pattern_len == 0:
        return

    last_start = len(token_ids) - pattern_len
    idx = 0
    while idx <= last_start:
        if token_ids[idx:idx + pattern_len] == match_ids:
            yield _TokenMatch(start_idx=idx, end_idx=idx + pattern_len)
            # Jump past this match so that matches never overlap
            idx += pattern_len
        else:
            idx += 1
325
+
326
+
327
@dataclass(repr=False)
class _PromptReplacementMatch(ABC):
    """One occurrence of a replacement target located in the prompt."""
    prompt_repl: BoundPromptReplacement

    @property
    def modality(self) -> str:
        return self.prompt_repl.modality

    @property
    @abstractmethod
    def start_idx(self) -> int:
        """Start of the matched span (inclusive)."""
        raise NotImplementedError

    @property
    @abstractmethod
    def end_idx(self) -> int:
        """End of the matched span (exclusive)."""
        raise NotImplementedError

    def __repr__(self) -> str:
        return (f"{type(self).__name__}(modality={self.modality!r}, "
                f"start_idx={self.start_idx!r}, end_idx={self.end_idx!r})")
348
+
349
+
350
@dataclass(repr=False)
class _PromptReplacementTokenMatch(_PromptReplacementMatch):
    """A match found by token-ID comparison (indices are token positions)."""
    match: _TokenMatch

    @property
    def start_idx(self) -> int:
        return self.match.start_idx

    @property
    def end_idx(self) -> int:
        return self.match.end_idx
361
+
362
+
363
@dataclass(repr=False)
class _PromptReplacementTextMatch(_PromptReplacementMatch):
    """A match found by text search (indices are character positions)."""
    match: re.Match[str]

    @property
    def start_idx(self) -> int:
        return self.match.start()

    @property
    def end_idx(self) -> int:
        return self.match.end()
374
+
375
+
376
@dataclass
class PlaceholderFeaturesInfo:
    """Location of one item's feature placeholder tokens in the prompt."""
    modality: str
    item_idx: int
    # Token index in the prompt where the feature placeholders begin
    start_idx: int
    # The feature placeholder token IDs themselves
    tokens: list[int]

    @property
    def length(self) -> int:
        return len(self.tokens)

    def to_range(self) -> PlaceholderRange:
        # Convert to the offset/length form used elsewhere in vLLM.
        return PlaceholderRange(
            offset=self.start_idx,
            length=self.length,
        )
392
+
393
+
394
def find_token_matches(
    prompt: list[int],
    prompt_repls: Sequence[BoundPromptReplacement],
) -> list[_PromptReplacementTokenMatch]:
    """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
    matches = list[_PromptReplacementTokenMatch]()

    for prompt_repl in prompt_repls:
        target_ids = prompt_repl.target.token_ids
        for token_match in iter_token_matches(prompt, target_ids):
            matches.append(
                _PromptReplacementTokenMatch(prompt_repl, token_match))

    return matches
404
+
405
+
406
def find_text_matches(
    prompt: str,
    prompt_repls: Sequence[BoundPromptReplacement],
) -> list[_PromptReplacementTextMatch]:
    """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
    matches = list[_PromptReplacementTextMatch]()

    for prompt_repl in prompt_repls:
        # Escape the target so it is matched literally, not as a regex
        pattern = re.escape(prompt_repl.target.text)
        for text_match in re.finditer(pattern, prompt):
            matches.append(_PromptReplacementTextMatch(prompt_repl,
                                                       text_match))

    return matches
416
+
417
+
418
def _resolve_matches(
    prompt: PromptSeq,
    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
) -> list[_PromptReplacementMatch]:
    """
    Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
    and sort them such that earlier matches take priority over later ones.
    """
    all_matches: list[_PromptReplacementMatch] = []
    for modality_matches in mm_matches.values():
        all_matches.extend(modality_matches)

    # Each prompt position may be claimed by at most one match
    claimed_by: list[Optional[_PromptReplacementMatch]] = [None] * len(prompt)

    for match in all_matches:
        for idx in range(match.start_idx, match.end_idx):
            prev = claimed_by[idx]
            if prev is not None:
                raise ValueError("Found overlapping matches "
                                 f"({prev} and {match}) "
                                 f"at index={idx} of prompt={prompt}")

            claimed_by[idx] = match

    return sorted(all_matches, key=lambda m: m.start_idx)
441
+
442
+
443
def _replace_matches(
    prompt: _S,
    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
    mm_item_counts: Mapping[str, int],
) -> list[_S]:
    """
    Apply the replacements in :code:`mm_matches` to :code:`prompt`.

    Works generically on either a text prompt (``str``) or a token-ID
    prompt (``list[int]``); returns the resulting segments, which the
    caller is expected to concatenate.
    """
    out_seqs = list[_S]()
    prev_end_idx = 0
    # Tracks how many items of each modality have been consumed so far
    next_idx_by_modality = defaultdict[str, int](lambda: 0)

    for match in _resolve_matches(prompt, mm_matches):
        modality = match.modality

        item_idx = next_idx_by_modality[modality]
        # Ignore surplus matches beyond the number of actual data items
        if item_idx >= mm_item_counts.get(modality, 0):
            continue

        start_idx = match.start_idx
        end_idx = match.end_idx

        repl_info = match.prompt_repl
        replacement = repl_info.get_replacement(item_idx)

        # Emit the unreplaced prefix plus the (full) replacement, in the
        # same representation as the input prompt
        if isinstance(prompt, str):
            repl_seq = replacement.full.text
            out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)
        else:
            repl_seq = replacement.full.token_ids
            out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)

        prev_end_idx = end_idx
        next_idx_by_modality[modality] += 1

    # Remaining tail after the last replacement
    out_seqs.append(prompt[prev_end_idx:])

    return out_seqs
479
+
480
+
481
def replace_token_matches(
    prompt: list[int],
    mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]],
    mm_item_counts: Mapping[str, int],
) -> list[int]:
    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
    # Fast path: nothing to replace
    if not mm_matches:
        return prompt

    replaced_segments = _replace_matches(prompt, mm_matches, mm_item_counts)
    return flatten_2d_lists(replaced_segments)
493
+
494
+
495
def replace_text_matches(
    prompt: str,
    mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]],
    mm_item_counts: Mapping[str, int],
) -> str:
    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
    # Fast path: nothing to replace
    if not mm_matches:
        return prompt

    replaced_segments = _replace_matches(prompt, mm_matches, mm_item_counts)
    return "".join(replaced_segments)
507
+
508
+
509
def _iter_placeholders(
    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
    prompt: list[int],
    mm_item_counts: Mapping[str, int],
) -> Iterable[PlaceholderFeaturesInfo]:
    """
    Yield each set of placeholder tokens found in :code:`prompt`.

    Matches are exclusive even when multiple modalities share
    the same placeholder tokens. In that case, the modality that
    appears earlier in `mm_prompt_repls` takes priority.

    Note that empty matches are ignored.
    """
    prompt_len = len(prompt)
    item_idx_by_modality = defaultdict[str, int](lambda: 0)

    # Linear scan: at each position, try every modality's replacements
    # until one matches, then skip past that match.
    start_idx = 0
    while start_idx < prompt_len:
        found = False

        for modality, modality_repls in mm_prompt_repls.items():
            item_idx = item_idx_by_modality[modality]
            # Stop matching this modality once all its items are accounted for
            if item_idx >= mm_item_counts.get(modality, 0):
                continue

            for repl_info in modality_repls:
                replacement = repl_info.get_replacement(item_idx)
                repl_tokens_full = replacement.full.token_ids
                repl_len_full = len(repl_tokens_full)
                end_idx_full = start_idx + repl_len_full

                # Skip empty replacements and those overrunning the prompt
                if repl_len_full == 0 or end_idx_full > prompt_len:
                    continue

                if prompt[start_idx:end_idx_full] == repl_tokens_full:
                    repl_tokens_feat = replacement.features.token_ids

                    # Locate the feature sub-sequence within the full
                    # replacement to compute its absolute offset
                    try:
                        match = next(
                            iter_token_matches(repl_tokens_full,
                                               repl_tokens_feat))
                        yield PlaceholderFeaturesInfo(
                            modality=modality,
                            item_idx=item_idx,
                            start_idx=start_idx + match.start_idx,
                            tokens=repl_tokens_feat,
                        )
                    except StopIteration:
                        raise AssertionError(
                            f"{repl_tokens_feat=} should be a "
                            f"subsequence of {repl_tokens_full=}") from None

                    # Exclude overlapping matches
                    start_idx = end_idx_full
                    item_idx_by_modality[modality] += 1
                    found = True
                    break

            if found:
                break  # Go back to the outer while loop

        if not found:
            start_idx += 1
573
+
574
+
575
def find_mm_placeholders(
    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
    prompt: list[int],
    mm_item_counts: Mapping[str, int],
) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
    """Locate placeholder tokens in ``prompt``, grouped by modality."""
    placeholders = _iter_placeholders(mm_prompt_repls, prompt,
                                      mm_item_counts)
    return dict(full_groupby_modality(placeholders))
582
+
583
+
584
class ProcessingCache:
    """LRU cache of processed multi-modal items, keyed by their inputs."""

    def __init__(self, capacity: int) -> None:
        super().__init__()

        # DEBUG: Set to None to disable
        self.debug_cache_hit_ratio_steps: Optional[int] = None

        self._cache = LRUCache[str, MultiModalKwargsItem](capacity)

    @staticmethod
    def _hash_item(
        model_id: str,
        modality: str,
        input_item: object,
        input_kwargs: Mapping[str, object],
    ) -> str:
        # The key covers everything that can affect the processed output:
        # the model, the modality, the raw data item, and the HF processor
        # configuration options.
        return MultiModalHasher.hash_kwargs(model_id=model_id,
                                            **{modality: input_item},
                                            **input_kwargs)

    def _maybe_log_cache_stats(self) -> None:
        steps = self.debug_cache_hit_ratio_steps
        if not steps:
            return

        cache_stats = self._cache.stat()
        if cache_stats.total % steps == 0:
            logger.debug("ProcessingCache: hit_ratio = %.2f",
                         cache_stats.hit_ratio)

    def get(
        self,
        model_id: str,
        modality: str,
        input_item: object,
        input_kwargs: Mapping[str, object],
    ) -> Optional[MultiModalKwargsItem]:
        """
        Get a processed multi-modal item from the cache
        according to its dependencies, including:

        - The model ID
        - The modality of the item
        - The original data item passed to the HF processor
        - The configuration options of the HF processor
        """
        self._maybe_log_cache_stats()

        return self._cache.get(
            self._hash_item(model_id, modality, input_item, input_kwargs))

    def put(
        self,
        model_id: str,
        modality: str,
        input_item: object,
        input_kwargs: Mapping[str, object],
        output_kwargs: MultiModalKwargsItem,
    ) -> None:
        """
        Put a processed multi-modal item into the cache
        according to its dependencies (see :meth:`get`).
        """
        self._cache.put(
            self._hash_item(model_id, modality, input_item, input_kwargs),
            output_kwargs)
643
+
644
+
645
class BaseProcessingInfo:
    """Base class to provide the information necessary for data processing."""

    def __init__(self, ctx: InputProcessingContext) -> None:
        super().__init__()

        self.ctx = ctx

    @property
    def model_id(self) -> str:
        # The model name/path identifying the served model.
        return self.ctx.model_config.model

    def get_tokenizer(self) -> AnyTokenizer:
        return self.ctx.tokenizer

    def get_hf_config(self) -> PretrainedConfig:
        return self.ctx.get_hf_config()

    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
        """
        Subclasses can override this method to handle
        specific kwargs from model config or user inputs.
        """
        return self.ctx.get_hf_processor(**kwargs)

    @abstractmethod
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """
        Return the maximum supported number of items for each modality.

        A value of `None` means unlimited number of items.

        Omitting a modality from the returned dictionary means that
        it is not supported at all.
        """
        raise NotImplementedError

    @abstractmethod
    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """
        Get the maximum possible number of tokens per data item
        for each modality.

        The dictionary returned by this method should have the same
        keys as that returned by :meth:`get_supported_mm_limits`.
        """
        raise NotImplementedError
696
+
697
+
698
# Type variable for the processing-info object a processor is generic over.
_I = TypeVar("_I", bound=BaseProcessingInfo)
699
+
700
+
701
class BaseMultiModalProcessor(ABC, Generic[_I]):
    """
    Abstract base class to process multi-modal inputs to be used in vLLM.

    Not to be confused with :class:`transformers.ProcessorMixin`.
    """

    def __init__(self,
                 info: _I,
                 dummy_inputs: "BaseDummyInputsBuilder[_I]",
                 *,
                 cache: Optional[ProcessingCache] = None,
                 enable_sanity_checks: bool = True) -> None:
        super().__init__()

        self.info = info
        self.dummy_inputs = dummy_inputs
        # Optional per-item cache of HF processor outputs
        self.cache = cache
        self.enable_sanity_checks = enable_sanity_checks

        self.data_parser = self._get_data_parser()
722
+
723
    def __call__(
        self,
        prompt: str,
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> MultiModalInputs:
        # Convenience alias: calling the processor is the same as `apply`.
        return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
730
+
731
    def _get_data_parser(self) -> MultiModalDataParser:
        """
        Construct a parser to preprocess multi-modal data items
        before passing them to :meth:`_get_hf_mm_data`.

        You can support additional modalities by creating a subclass
        of :class:`MultiModalDataParser` that has additional subparsers.
        """
        return MultiModalDataParser()
740
+
741
    def _to_mm_items(
        self,
        mm_data: MultiModalDataDict,
    ) -> MultiModalDataItems:
        """
        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
        before passing them to :meth:`_get_hf_mm_data`.

        Raises:
            ValueError: If the number of items of any modality exceeds the
                per-prompt limit configured via ``--limit-mm-per-prompt``.
        """
        mm_items = self.data_parser.parse_mm_data(mm_data)

        # Enforce per-prompt item limits (each modality defaults to 1)
        mm_limits = self.info.ctx.get_mm_config().limit_per_prompt
        for modality, items in mm_items.items():
            limit = mm_limits.get(modality, 1)
            if len(items) > limit:
                raise ValueError(
                    f"You set {modality}={limit} (or defaulted to 1) in "
                    f"`--limit-mm-per-prompt`, but passed {len(items)} "
                    f"{modality} items in the same prompt.")

        return mm_items
761
+
762
    @abstractmethod
    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Given the HF-processed data, output the metadata of each field."""
        raise NotImplementedError
770
+
771
    @abstractmethod
    def _get_prompt_replacements(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> list[PromptReplacement]:
        """
        Given the original multi-modal items for this modality
        and HF-processed data, output the replacements to perform.

        Notes:
            - You should not assume that HF processor always performs prompt
              replacement: in :meth:`_apply_hf_processor_missing`, this method
              is called on text-only and multimodal-only inputs separately,
              instead of passing them in the same call.
            - The replacement information returned by this method is also used
              to determine the placeholder token positions for each multi-modal
              item.
        """
        raise NotImplementedError
792
+
793
    def _find_mm_placeholders(
        self,
        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
        new_token_ids: list[int],
        mm_item_counts: Mapping[str, int],
    ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
        # Overridable hook; by default delegates to the module-level helper.
        return find_mm_placeholders(mm_prompt_repls, new_token_ids,
                                    mm_item_counts)
801
+
802
+ def _get_hf_mm_data(
803
+ self,
804
+ mm_items: MultiModalDataItems,
805
+ ) -> tuple[Mapping[str, object], Mapping[str, object]]:
806
+ processor_data = dict[str, object]()
807
+ passthrough_data = dict[str, object]()
808
+
809
+ for items in mm_items.values():
810
+ processor_data.update(items.get_processor_data())
811
+ passthrough_data.update(items.get_passthrough_data())
812
+
813
+ return processor_data, passthrough_data
814
+
815
    def _call_hf_processor(
        self,
        prompt: str,
        # Not to be confused with `mm_data` in `self.apply`.
        # This refers to the data to be passed to HF processor.
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """
        Call the HF processor on the prompt text and
        associated multi-modal data.
        """
        return self.info.ctx.call_hf_processor(
            self.info.get_hf_processor(**mm_kwargs),
            dict(text=prompt, **mm_data),
            mm_kwargs,
        )
832
+
833
    def _apply_hf_processor_text_mm(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> tuple[list[int], MultiModalKwargs]:
        """
        Apply the HF processor on the prompt text and multi-modal data
        together.

        Returns the tokenized prompt and the multi-modal keyword arguments.
        """
        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)

        processed_data = self._call_hf_processor(
            prompt=prompt_text,
            mm_data=processor_data,
            mm_kwargs=hf_processor_mm_kwargs,
        )
        # Data that bypasses the HF processor is merged back in unchanged
        processed_data.update(passthrough_data)

        # `input_ids` is a batch of size 1; unpack the single sequence
        prompt_ids, = processed_data.pop("input_ids").tolist()

        mm_kwargs = MultiModalKwargs.from_hf_inputs(
            processed_data,
            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
        )

        return prompt_ids, mm_kwargs
860
+
861
    def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]:
        """
        Apply the HF processor on the prompt text only.

        Since HF processor requires that text and multi-modal items
        correspond to each other, we create dummy multi-modal items
        to go along with the text.
        """
        # Empty mm_items means no multi-modal data is passed at all;
        # only the tokenized prompt is kept.
        prompt_ids, _ = self._apply_hf_processor_text_mm(
            prompt_text=prompt_text,
            mm_items=MultiModalDataItems({}),
            hf_processor_mm_kwargs={},
        )

        return prompt_ids
876
+
877
    def _apply_hf_processor_tokens_only(
        self,
        prompt_tokens: list[int],
    ) -> list[int]:
        """
        Apply the HF processor on the prompt tokens only.

        Most HF processors accept prompt text but not prompt tokens.
        If the HF processor adds or removes tokens that are not related to
        multi-modal data, you should override this method so it is consistent
        with the output of :meth:`_apply_hf_processor_text_only` on the
        corresponding text.
        """
        # Default: assume the processor makes no token-level changes.
        return prompt_tokens
891
+
892
    def _apply_hf_processor_mm_only(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> MultiModalKwargs:
        """
        Apply the HF processor on the multi-modal data only.

        Since HF processor requires that text and multi-modal items
        correspond to each other, we generate dummy text using
        :class:`DummyInputsBuilder` to go along with the multi-modal data.
        """
        mm_counts = mm_items.get_all_counts()

        dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
            self.info.ctx.model_config.max_model_len,
            mm_counts,
        )

        # The dummy prompt's token IDs are discarded; only the multi-modal
        # keyword arguments are kept.
        _, mm_kwargs = self._apply_hf_processor_text_mm(
            prompt_text=dummy_inputs.prompt_text,
            mm_items=mm_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        )

        return mm_kwargs
918
+
919
    def _apply_hf_processor_main(
        self,
        prompt: Union[str, list[int]],
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        *,
        enable_hf_prompt_replacement: bool,
    ) -> tuple[list[int], MultiModalKwargs]:
        """
        Apply the HF processor on the prompt text and multi-modal data.

        Note:
            If :code:`enable_hf_prompt_replacement=False`, the prompt should
            correspond to the multi-modal items.
        """
        if isinstance(prompt, str):
            # Text + data together lets the HF processor perform its own
            # prompt replacement in a single pass
            if enable_hf_prompt_replacement:
                return self._apply_hf_processor_text_mm(
                    prompt_text=prompt,
                    mm_items=mm_items,
                    hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                )

            prompt_ids = self._apply_hf_processor_text_only(prompt)
        else:
            prompt_ids = self._apply_hf_processor_tokens_only(prompt)

        # Process the multi-modal data separately from the prompt
        mm_missing_kwargs = self._apply_hf_processor_mm_only(
            mm_items=mm_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        )

        return prompt_ids, mm_missing_kwargs
952
+
953
    def _cached_apply_hf_processor(
        self,
        prompt: Union[str, list[int]],
        mm_data_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> tuple[list[int], MultiModalKwargs]:
        """
        Apply the HF processor on the full prompt text,
        caching the results and reusing cached results.

        Only the items missing from the cache are passed through the HF
        processor; their outputs are then merged with the cached ones in
        the original item order.
        """
        cache = self.cache
        model_id = self.info.model_id

        # Passthrough data cannot be cached per item, so fall back to
        # uncached processing in that case
        _, passthrough_data = self._get_hf_mm_data(mm_data_items)
        if cache is None or passthrough_data:
            return self._apply_hf_processor_main(
                prompt=prompt,
                mm_items=mm_data_items,
                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                enable_hf_prompt_replacement=True,
            )

        # Per-item cache lookup; `None` marks a cache miss
        mm_maybe_cached_kw_items = {
            modality: [
                cache.get(model_id, modality, item, hf_processor_mm_kwargs)
                for item in items
            ]
            for modality, items in mm_data_items.items()
        }

        mm_missing_idxs = {
            modality:
            [idx for idx, item in enumerate(kw_items) if item is None]
            for modality, kw_items in mm_maybe_cached_kw_items.items()
        }
        mm_missing_data = {
            modality: [mm_data_items[modality][idx] for idx in idxs]
            for modality, idxs in mm_missing_idxs.items()
        }
        mm_missing_data_items = self._to_mm_items(mm_missing_data)

        # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
        # so we need to pass `enable_hf_prompt_replacement=False`
        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main(
            prompt=prompt,
            mm_items=mm_missing_data_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
            enable_hf_prompt_replacement=False,
        )

        # Cursor into `mm_missing_kwargs` for each modality
        mm_missing_next_idx = {
            modality: 0
            for modality in mm_missing_data_items
        }

        # Merge freshly processed items back into position, populating the
        # cache as we go
        merged_kw_items = list[MultiModalKwargsItem]()
        for modality, kw_items in mm_maybe_cached_kw_items.items():
            for idx, kw_item in enumerate(kw_items):
                if kw_item is None:
                    kw_item = mm_missing_kwargs.get_item(
                        modality,
                        mm_missing_next_idx[modality],
                    )

                    cache.put(
                        model_id,
                        modality,
                        mm_data_items[modality][idx],
                        hf_processor_mm_kwargs,
                        kw_item,
                    )

                    mm_missing_next_idx[modality] += 1

                merged_kw_items.append(kw_item)

        if self.enable_sanity_checks:
            # Every missing item must have been consumed exactly once
            mm_missing_counts = mm_missing_data_items.get_all_counts()
            assert all(
                item_count == mm_missing_counts[modality]
                for modality, item_count in mm_missing_next_idx.items()), dict(
                    mm_missing_next_idx=mm_missing_next_idx,
                    mm_missing_counts=mm_missing_counts)

        mm_kwargs = MultiModalKwargs.from_items(merged_kw_items)

        return prompt_ids, mm_kwargs
1040
+
1041
+ def _bind_and_group_repls(
1042
+ self,
1043
+ prompt_repls: list[PromptReplacement],
1044
+ ) -> dict[str, list[BoundPromptReplacement]]:
1045
+ tokenizer = self.info.get_tokenizer()
1046
+
1047
+ it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls)
1048
+ return dict(full_groupby_modality(it))
1049
+
1050
    def _always_apply_prompt_replacements(self) -> bool:
        """
        A flag which can be overridden so that
        :meth:`_apply_prompt_replacements` is always called even if we
        detect that HF has performed processing via
        :meth:`_find_placeholders_by_modality`.

        This is useful in cases where :meth:`_find_placeholders_by_modality`
        cannot be reliably used to detect whether HF has performed processing.
        """
        return False
1061
+
1062
    def _apply_prompt_replacements(
        self,
        token_ids: list[int],
        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
        mm_item_counts: Mapping[str, int],
    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
        """
        Replace the placeholder targets in ``token_ids`` and locate the
        resulting feature placeholders.

        Returns the new token IDs, the corresponding decoded text, and the
        placeholder positions found for each modality.
        """
        tokenizer = self.info.get_tokenizer()

        mm_token_matches = {
            modality: find_token_matches(token_ids, prompt_repls)
            for modality, prompt_repls in mm_prompt_repls.items()
        }
        mm_match_counts = {
            modality: len(matches)
            for modality, matches in mm_token_matches.items()
        }

        # If the search text does not represent a special token,
        # it may have different token IDs in the prompt, because
        # the tokens may go across the boundaries of the search text.
        # ----
        # e.g. when searching for "foo" in "food", if "food" itself makes
        # up a token, then the token ID of "foo" will not appear at all
        # ----
        # Since it is inefficient to search for all possible tokenizations
        # of the search text in the prompt, we instead perform string
        # replacement on the decoded token IDs, then encode them back.
        if all(
            mm_match_counts.get(modality, 0) >= item_count
            for modality, item_count in mm_item_counts.items()
        ):  # yapf: disable
            # Enough token-level matches for every item: replace directly
            # on the token IDs
            token_ids = replace_token_matches(
                token_ids,
                mm_token_matches,
                mm_item_counts,
            )

            text = decode_tokens(tokenizer, token_ids)
            matched_repls = {
                modality: [match.prompt_repl for match in token_matches]
                for modality, token_matches in mm_token_matches.items()
            }
        else:
            # Fall back to text-level replacement, then re-encode
            text = decode_tokens(tokenizer, token_ids)

            mm_text_matches = {
                modality: find_text_matches(text, prompt_repls)
                for modality, prompt_repls in mm_prompt_repls.items()
            }
            text = replace_text_matches(
                text,
                mm_text_matches,
                mm_item_counts,
            )

            token_ids = encode_tokens(tokenizer,
                                      text,
                                      add_special_tokens=False)
            matched_repls = {
                modality: [match.prompt_repl for match in token_matches]
                for modality, token_matches in mm_text_matches.items()
            }

        # Locate the feature placeholder positions in the updated prompt
        placeholders = self._find_mm_placeholders(
            matched_repls,
            token_ids,
            mm_item_counts,
        )

        return token_ids, text, placeholders
1132
+
1133
    def _validate_mm_kwargs(
        self,
        mm_kwargs: MultiModalKwargs,
        mm_item_counts: Mapping[str, int],
    ) -> None:
        """
        Check that ``mm_kwargs`` contains exactly one entry per data item
        for each modality.

        Raises:
            RuntimeError: If the item counts do not match, which usually
                indicates an inconsistency in the processor implementation.
        """
        for modality, item_count in mm_item_counts.items():
            if modality in mm_kwargs.modalities:
                items = mm_kwargs.get_items(modality)
            else:
                items = []

            if len(items) != item_count:
                raise RuntimeError(
                    f"Expected there to be {item_count} {modality} items in "
                    f"keyword arguments corresponding to {item_count} "
                    f"{modality} data items, but only found {len(items)}! "
                    "There is likely a problem with your "
                    "implementation of merged multi-modal processor for this "
                    "model (usually arising from an inconsistency between "
                    "`_call_hf_processor` and `_get_mm_fields_config`).")
1153
+
1154
+ def _validate_mm_placeholders(
1155
+ self,
1156
+ mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
1157
+ mm_item_counts: Mapping[str, int],
1158
+ *,
1159
+ allow_missing: bool = False,
1160
+ ) -> Mapping[str, int]:
1161
+ missing_repl_counts = dict[str, int]()
1162
+
1163
+ for modality, item_count in mm_item_counts.items():
1164
+ placeholders = mm_placeholders.get(modality, [])
1165
+
1166
+ if len(placeholders) != item_count and not allow_missing:
1167
+ raise RuntimeError(
1168
+ f"Expected there to be {item_count} prompt replacements "
1169
+ f"corresponding to {item_count} {modality} items, but only "
1170
+ f"found {len(placeholders)} prompt replacements! Either "
1171
+ "the prompt text has missing/incorrect tokens for "
1172
+ "multi-modal inputs, or there is a problem with your "
1173
+ "implementation of merged multi-modal processor for this "
1174
+ "model (usually arising from an inconsistency between "
1175
+ "`_call_hf_processor` and `_get_prompt_replacements`).")
1176
+
1177
+ missing_repl_counts[modality] = item_count - len(placeholders)
1178
+
1179
+ return missing_repl_counts
1180
+
1181
    def apply(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> MultiModalInputs:
        """
        Process multi-modal inputs to be used in vLLM.

        The main steps are:

        1. Apply HF Processor on prompt text and multi-modal data together,
        outputting token IDs and processed tensors.
        2. Find and replace sequences in the token IDs with placeholder tokens.
        The number of placeholder tokens equals the feature size of the
        multi-modal data outputted by the multi-modal encoder.
        3. Extract information about the placeholder tokens from the
        processed token IDs.
        """
        mm_items = self._to_mm_items(mm_data)

        # Create MM hashes (only used in V1)
        # TODO: Use these hash keys for caching operations in apply_hf_processor
        # instead of rehashing.

        if envs.VLLM_USE_V1:
            model_id = self.info.model_id
            # One hash per item, keyed by modality; the hash covers the model,
            # the item itself, and the processor kwargs so any change busts it.
            mm_hashes = {
                modality: [
                    MultiModalHasher.hash_kwargs(model_id=model_id,
                                                 **{modality: item},
                                                 **hf_processor_mm_kwargs)
                    for item in items
                ]
                for modality, items in mm_items.items()
            }
        else:
            mm_hashes = None

        # Step 1: run the (cached) HF processor on text + media together.
        prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
            prompt,
            mm_items,
            hf_processor_mm_kwargs,
        )

        unbound_prompt_repls = self._get_prompt_replacements(
            mm_items,
            hf_processor_mm_kwargs,
            mm_kwargs,
        )
        mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls)

        mm_item_counts = mm_items.get_all_counts()
        self._validate_mm_kwargs(mm_kwargs, mm_item_counts)

        # Placeholders that the HF processor may have already inserted.
        hf_mm_placeholders = self._find_mm_placeholders(
            mm_prompt_repls,
            prompt_ids,
            mm_item_counts,
        )

        if self._always_apply_prompt_replacements():
            # Force-apply all replacements regardless of what HF inserted.
            mm_missing_repl_counts = mm_item_counts
            mm_missing_repls = dict(mm_prompt_repls)
        else:
            # Only apply replacements for items HF did not already handle.
            mm_missing_repl_counts = self._validate_mm_placeholders(
                hf_mm_placeholders,
                mm_item_counts,
                allow_missing=True,
            )

            mm_missing_repls = dict[str, list[BoundPromptReplacement]]()
            for modality, missing_repl_count in mm_missing_repl_counts.items():
                if missing_repl_count == 0:
                    mm_missing_repls[modality] = []
                elif missing_repl_count == mm_item_counts.get(modality, 0):
                    mm_missing_repls[modality] = mm_prompt_repls[modality]
                else:
                    # Either all items of a modality were handled by HF or
                    # none were; a mix cannot be resolved unambiguously.
                    raise ValueError("Partial prompt replacement within "
                                     f"{modality=} is not supported")

        # If HF processor already inserts placeholder tokens,
        # there is no need for us to insert them
        if all(len(repls) == 0 for repls in mm_missing_repls.values()):
            tokenizer = self.info.get_tokenizer()
            prompt = decode_tokens(tokenizer, prompt_ids)
            mm_placeholders = hf_mm_placeholders
        else:
            # Step 2: insert the remaining placeholders ourselves.
            (
                prompt_ids,
                prompt,
                missing_mm_placeholders,
            ) = self._apply_prompt_replacements(
                prompt_ids,
                mm_missing_repls,
                mm_missing_repl_counts,
            )

            mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders}

        # Final strict check: every data item must now have a placeholder.
        self._validate_mm_placeholders(mm_placeholders, mm_item_counts)

        # Step 3: convert placeholder info into offset/length ranges.
        mm_placeholder_ranges = {
            modality: [item.to_range() for item in placeholders]
            for modality, placeholders in mm_placeholders.items()
        }

        return MultiModalInputs(
            type="multimodal",
            prompt=prompt,
            prompt_token_ids=prompt_ids,
            mm_kwargs=mm_kwargs,
            mm_hashes=mm_hashes,
            mm_placeholders=mm_placeholder_ranges,
        )
.venv/lib/python3.11/site-packages/vllm/multimodal/profiling.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Mapping
5
+ from dataclasses import dataclass, field
6
+ from typing import Generic, TypeVar
7
+
8
+ import numpy as np
9
+ import numpy.typing as npt
10
+ from PIL import Image
11
+
12
+ import vllm.envs as envs
13
+ from vllm.inputs import DummyData
14
+ from vllm.logger import init_logger
15
+
16
+ from .inputs import MultiModalDataDict, MultiModalInputs
17
+ from .processing import BaseMultiModalProcessor, BaseProcessingInfo
18
+
19
+ logger = init_logger(__name__)
20
+
21
+
22
@dataclass
class ProcessorInputs:
    """
    Represents the keyword arguments to
    :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
    """
    # Text prompt passed to the processor.
    prompt_text: str
    # Multi-modal data items, keyed by modality.
    mm_data: MultiModalDataDict
    # Extra keyword arguments forwarded to the HF processor call.
    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
31
+
32
+
33
+ _I = TypeVar("_I", bound=BaseProcessingInfo)
34
+
35
+
36
class BaseDummyInputsBuilder(ABC, Generic[_I]):
    """
    Abstract base class that constructs the dummy data to profile
    multi-modal models.
    """

    def __init__(self, info: _I) -> None:
        super().__init__()

        self.info = info

    @abstractmethod
    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """
        Build the input which, after processing, results in
        :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
        """
        raise NotImplementedError

    def _get_dummy_audios(
        self,
        *,
        length: int,
        num_audios: int,
    ) -> list[npt.NDArray]:
        # A single silent waveform; the same array object is shared by
        # every list entry since the data is never mutated.
        silence = np.zeros((length, ))
        return [silence] * num_audios

    def _get_dummy_images(
        self,
        *,
        width: int,
        height: int,
        num_images: int,
    ) -> list[Image.Image]:
        # One black RGB canvas, shared across all requested items.
        blank = Image.new("RGB", (width, height), color=0)
        return [blank] * num_images

    def _get_dummy_videos(
        self,
        *,
        width: int,
        height: int,
        num_frames: int,
        num_videos: int,
    ) -> list[npt.NDArray]:
        # One all-zero clip, shared across all requested items.
        # NOTE(review): frame shape is (width, height, 3) here, not the
        # more common (height, width, 3) — confirm against consumers.
        clip = np.zeros((num_frames, width, height, 3))
        return [clip] * num_videos
88
+
89
+
90
class MultiModalProfiler(Generic[_I]):
    """
    Contains code for running memory profiling for multi-modal models.
    """

    def __init__(
        self,
        processor: BaseMultiModalProcessor[_I],
    ) -> None:
        super().__init__()

        self.processor = processor

    @property
    def processing_info(self) -> BaseProcessingInfo:
        """The processing info object of the wrapped processor."""
        return self.processor.info

    @property
    def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
        """The dummy-input builder of the wrapped processor."""
        return self.processor.dummy_inputs

    def get_mm_limits(self) -> Mapping[str, int]:
        """Resolve the per-prompt item limit for each supported modality.

        User-configured limits (``--limit-mm-per-prompt``) default to 1 per
        modality and must not exceed the model's own supported maximum.

        Raises:
            ValueError: If a configured limit exceeds the model's maximum.
        """
        mm_config = self.processing_info.ctx.get_mm_config()
        mm_limit_per_prompt = mm_config.limit_per_prompt

        supported_mm_limits = self.processing_info.get_supported_mm_limits()

        mm_limits = {
            modality: mm_limit_per_prompt.get(modality, 1)
            for modality in supported_mm_limits
        }

        for modality, supported_limit in supported_mm_limits.items():
            limit = mm_limits[modality]
            # supported_limit of None means "no model-imposed maximum".
            if supported_limit is not None and supported_limit < limit:
                raise ValueError(
                    f"You set {modality}={limit} (or defaulted to 1) in "
                    f"`--limit-mm-per-prompt`, but this model only supports "
                    f"at most {supported_limit} {modality} items.")

        return mm_limits

    def _get_dummy_mm_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalInputs:
        """Build dummy inputs and run them through the full processor."""
        factory = self.dummy_inputs
        processor_inputs = factory.get_dummy_processor_inputs(
            seq_len, mm_counts)

        return self.processor.apply(
            prompt=processor_inputs.prompt_text,
            mm_data=processor_inputs.mm_data,
            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
        )

    def get_dummy_data(self, seq_len: int) -> DummyData:
        """Produce worst-case dummy data of length ``seq_len`` for profiling.

        Validates that the dummy processor output contains exactly the
        expected number of placeholder tokens per modality, then pads the
        prompt with zeros up to ``seq_len``.

        Raises:
            AssertionError: If the processor's reported modalities or
                placeholder counts are inconsistent with the dummy output.
        """
        # Avoid circular import
        from vllm.sequence import SequenceData

        mm_counts = self.get_mm_limits()

        info = self.processing_info
        mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(
            seq_len, mm_counts)

        if mm_counts.keys() != mm_max_tokens_per_item.keys():
            raise AssertionError(
                "The keys returned by `get_supported_mm_limits`"
                f"({set(mm_counts.keys())}) should be the same as those "
                "returned by `get_mm_max_tokens_per_item` "
                f"({set(mm_max_tokens_per_item.keys())})")

        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        prompt_token_ids = mm_inputs["prompt_token_ids"]
        placeholders_by_modality = mm_inputs["mm_placeholders"]

        total_placeholders_by_modality = {
            modality: sum(item["length"] for item in placeholders)
            for modality, placeholders in placeholders_by_modality.items()
        }
        expected_placeholders_by_modality = {
            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
            for modality in placeholders_by_modality
        }
        if total_placeholders_by_modality != expected_placeholders_by_modality:
            raise AssertionError(
                f"The processed dummy data has a total of "
                f"{total_placeholders_by_modality} placeholder tokens, which "
                f"is not the expected {expected_placeholders_by_modality} "
                "tokens.")

        total_len = len(prompt_token_ids)

        # V0 does not support chunked prefill.
        if total_len > seq_len and not envs.VLLM_USE_V1:
            # Warn and fall back to text-only dummy data rather than
            # failing; the multi-modal part cannot fit in the context.
            logger.warning(
                "The context length (%d) of the model is too short "
                "to hold the multi-modal embeddings in the worst case "
                "(%d tokens in total, out of which %s are reserved for "
                "multi-modal embeddings). This may cause certain multi-modal "
                "inputs to fail during inference, even when the input text is "
                "short. To avoid this, you should increase `max_model_len`, "
                "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
                total_len, total_placeholders_by_modality)

            return DummyData(
                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
                multi_modal_data=None,
                multi_modal_placeholders=None,
            )

        # Right-pad the prompt with zeros up to seq_len (no-op if the
        # prompt already reaches or exceeds seq_len, since the repeat
        # count is then <= 0).
        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))

        return DummyData(
            seq_data=SequenceData.from_seqs(prompt_token_ids),
            multi_modal_data=mm_inputs["mm_kwargs"],
            multi_modal_placeholders=placeholders_by_modality,
        )
.venv/lib/python3.11/site-packages/vllm/multimodal/registry.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import functools
4
+ from collections import UserDict
5
+ from dataclasses import dataclass
6
+ from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,
7
+ Protocol, Sequence, Type, TypeVar)
8
+
9
+ import torch.nn as nn
10
+
11
+ from vllm.inputs import InputProcessingContext
12
+ from vllm.logger import init_logger
13
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
14
+ from vllm.utils import ClassRegistry
15
+
16
+ from .audio import AudioPlugin
17
+ from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
18
+ from .image import ImagePlugin
19
+ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
20
+ from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
21
+ ProcessingCache)
22
+ from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
23
+ from .utils import cached_get_tokenizer
24
+ from .video import VideoPlugin
25
+
26
+ if TYPE_CHECKING:
27
+ from vllm.config import ModelConfig
28
+
29
+ logger = init_logger(__name__)
30
+
31
+ # TODO: Tune the MM cache size
32
+ MM_CACHE_SIZE = 256
33
+
34
+ N = TypeVar("N", bound=Type[nn.Module])
35
+ _I = TypeVar("_I", bound=BaseProcessingInfo)
36
+ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
37
+
38
+
39
+ class ProcessingInfoFactory(Protocol[_I_co]):
40
+ """Constructs a :class:`MultiModalProcessor` instance from the context."""
41
+
42
+ def __call__(
43
+ self,
44
+ ctx: InputProcessingContext,
45
+ ) -> _I_co:
46
+ ...
47
+
48
+
49
+ class DummyInputsBuilderFactory(Protocol[_I]):
50
+ """
51
+ Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
52
+ """
53
+
54
+ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
55
+ ...
56
+
57
+
58
+ class MultiModalProcessorFactory(Protocol[_I]):
59
+ """Constructs a :class:`MultiModalProcessor` instance from the context."""
60
+
61
+ def __call__(
62
+ self,
63
+ info: _I,
64
+ dummy_inputs: BaseDummyInputsBuilder[_I],
65
+ *,
66
+ cache: Optional[ProcessingCache] = None,
67
+ ) -> BaseMultiModalProcessor[_I]:
68
+ ...
69
+
70
+
71
+ @dataclass(frozen=True)
72
+ class _ProcessorFactories(Generic[_I]):
73
+ info: ProcessingInfoFactory[_I]
74
+ processor: MultiModalProcessorFactory[_I]
75
+ dummy_inputs: DummyInputsBuilderFactory[_I]
76
+
77
+ def build_processor(
78
+ self,
79
+ ctx: InputProcessingContext,
80
+ *,
81
+ cache: Optional[ProcessingCache] = None,
82
+ ):
83
+ info = self.info(ctx)
84
+ dummy_inputs_builder = self.dummy_inputs(info)
85
+ return self.processor(info, dummy_inputs_builder, cache=cache)
86
+
87
+
88
+ class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
89
+ """
90
+ Wraps `_limits_by_model` for a more informative error message
91
+ when attempting to access a model that does not exist.
92
+ """
93
+
94
+ def __getitem__(self, key: "ModelConfig") -> Dict[str, int]:
95
+ try:
96
+ return super().__getitem__(key)
97
+ except KeyError as exc:
98
+ msg = (f"Cannot find `mm_limits` for model={key.model}. Did you "
99
+ "forget to call `init_mm_limits_per_prompt`?")
100
+ raise KeyError(msg) from exc
101
+
102
+
103
+ class MultiModalRegistry:
104
+ """
105
+ A registry that dispatches data processing according to the model.
106
+ """
107
+
108
+ DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
109
+
110
+ def __init__(
111
+ self,
112
+ *,
113
+ plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None:
114
+ self._plugins = {p.get_data_key(): p for p in plugins}
115
+
116
+ self._processor_factories = ClassRegistry[nn.Module,
117
+ _ProcessorFactories]()
118
+
119
+ # This is used for non-multimodal models
120
+ self._disabled_limits_per_plugin = {k: 0 for k in self._plugins}
121
+
122
+ self._limits_by_model = _MultiModalLimits()
123
+
124
+ self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
125
+
126
+ def register_plugin(self, plugin: MultiModalPlugin) -> None:
127
+ """
128
+ Register a multi-modal plugin so it can be recognized by vLLM.
129
+ """
130
+ data_type_key = plugin.get_data_key()
131
+
132
+ if data_type_key in self._plugins:
133
+ logger.warning(
134
+ "A plugin is already registered for data type %s, "
135
+ "and will be overwritten by the new plugin %s.", data_type_key,
136
+ plugin)
137
+
138
+ self._plugins[data_type_key] = plugin
139
+
140
+ def _get_plugin(self, data_type_key: str):
141
+ plugin = self._plugins.get(data_type_key)
142
+ if plugin is not None:
143
+ return plugin
144
+
145
+ msg = f"Unknown multi-modal data type: {data_type_key}"
146
+ raise NotImplementedError(msg)
147
+
148
+ def register_input_mapper(
149
+ self,
150
+ data_type_key: str,
151
+ mapper: Optional[MultiModalInputMapper] = None,
152
+ ):
153
+ """
154
+ Register an input mapper for a specific modality to a model class.
155
+
156
+ See :meth:`MultiModalPlugin.register_input_mapper` for more details.
157
+ """
158
+ return self._get_plugin(data_type_key).register_input_mapper(mapper)
159
+
160
+ def register_image_input_mapper(
161
+ self,
162
+ mapper: Optional[MultiModalInputMapper] = None,
163
+ ):
164
+ """
165
+ Register an input mapper for image data to a model class.
166
+
167
+ See :meth:`MultiModalPlugin.register_input_mapper` for more details.
168
+ """
169
+ return self.register_input_mapper("image", mapper)
170
+
171
+ def map_input(
172
+ self,
173
+ model_config: "ModelConfig",
174
+ data: MultiModalDataDict,
175
+ mm_processor_kwargs: Optional[Dict[str, Any]] = None,
176
+ ) -> MultiModalKwargs:
177
+ """
178
+ Apply an input mapper to the data passed to the model.
179
+
180
+ The data belonging to each modality is passed to the corresponding
181
+ plugin which in turn converts the data into into keyword arguments
182
+ via the input mapper registered for that model.
183
+
184
+ See :meth:`MultiModalPlugin.map_input` for more details.
185
+
186
+ Note:
187
+ This should be called after :meth:`init_mm_limits_per_prompt`.
188
+ """
189
+ merged_dict: Dict[str, NestedTensors] = {}
190
+
191
+ for data_key, data_value in data.items():
192
+ plugin = self._get_plugin(data_key)
193
+
194
+ num_items = len(data_value) if isinstance(data_value, list) else 1
195
+ max_items = self._limits_by_model[model_config][data_key]
196
+ if num_items > max_items:
197
+ raise ValueError(
198
+ f"You set {data_key}={max_items} (or defaulted to 1) in "
199
+ f"`--limit-mm-per-prompt`, but found {num_items} items "
200
+ "in the same prompt.")
201
+
202
+ input_dict = plugin.map_input(model_config, data_value,
203
+ mm_processor_kwargs)
204
+ for input_key, input_tensor in input_dict.items():
205
+ if input_key in merged_dict:
206
+ raise ValueError(f"The input mappers (keys={set(data)}) "
207
+ f"resulted in a conflicting keyword "
208
+ f"argument to `forward()`: {input_key}")
209
+
210
+ merged_dict[input_key] = input_tensor
211
+
212
+ return MultiModalKwargs(merged_dict)
213
+
214
+ def create_input_mapper(self, model_config: "ModelConfig"):
215
+ """
216
+ Create an input mapper (see :meth:`map_input`) for a specific model.
217
+ """
218
+ # NOTE - we currently make the assumption that if a model has multiple
219
+ # supported modalities, they take the same kwargs. For the default,
220
+ # this could be an issue in the future if it falls back to two HF
221
+ # resources and we can't inspect the signature easily since it's
222
+ # getting initialized through the autoclass.
223
+ #
224
+ # If this is a problem in the future, we should revisit it, but since
225
+ # it potentially introduces a lot of complexity for a currently
226
+ # uncommon case, we do not for simplicity of both use & implementation
227
+ return functools.partial(self.map_input, model_config)
228
+
229
+ def register_max_multimodal_tokens(
230
+ self,
231
+ data_type_key: str,
232
+ max_mm_tokens: Optional[MultiModalTokensCalc] = None,
233
+ ):
234
+ """
235
+ Register the maximum number of tokens, corresponding to a single
236
+ instance of multimodal data belonging to a specific modality, that are
237
+ passed to the language model for a model class.
238
+ """
239
+ return self._get_plugin(data_type_key) \
240
+ .register_max_multimodal_tokens(max_mm_tokens)
241
+
242
+ def register_max_image_tokens(
243
+ self,
244
+ max_mm_tokens: Optional[MultiModalTokensCalc] = None,
245
+ ):
246
+ """
247
+ Register the maximum number of image tokens, corresponding to a single
248
+ image, that are passed to the language model for a model class.
249
+ """
250
+ return self.register_max_multimodal_tokens("image", max_mm_tokens)
251
+
252
+ def get_max_tokens_per_item_by_modality(
253
+ self,
254
+ model_config: "ModelConfig",
255
+ ) -> Mapping[str, int]:
256
+ """
257
+ Get the maximum number of tokens per data item from each modality based
258
+ on underlying model configuration.
259
+ """
260
+ if self.has_processor(model_config):
261
+ tokenizer = cached_get_tokenizer(
262
+ model_config.tokenizer,
263
+ trust_remote_code=model_config.trust_remote_code,
264
+ )
265
+ processor = self.create_processor(model_config, tokenizer)
266
+ seq_len = model_config.max_model_len
267
+ mm_limits = self.get_mm_limits_per_prompt(model_config)
268
+ return processor.info.get_mm_max_tokens_per_item(
269
+ seq_len, mm_limits)
270
+
271
+ return {
272
+ key: plugin.get_max_multimodal_tokens(model_config)
273
+ for key, plugin in self._plugins.items()
274
+ }
275
+
276
+ def get_max_tokens_per_item_by_nonzero_modality(
277
+ self,
278
+ model_config: "ModelConfig",
279
+ ) -> Mapping[str, int]:
280
+ """
281
+ Get the maximum number of tokens per data item from each modality based
282
+ on underlying model configuration, excluding modalities that user
283
+ explicitly disabled via `limit_mm_per_prompt`.
284
+
285
+ Note:
286
+ This is currently directly used only in V1 for profiling the memory
287
+ usage of a model.
288
+ """
289
+ mm_limits = self.get_mm_limits_per_prompt(model_config)
290
+
291
+ return {
292
+ key: max_tokens_per_mm_item
293
+ for key, max_tokens_per_mm_item in
294
+ self.get_max_tokens_per_item_by_modality(model_config).items()
295
+ if mm_limits[key] > 0
296
+ }
297
+
298
+ def get_max_tokens_by_modality(
299
+ self,
300
+ model_config: "ModelConfig",
301
+ ) -> Mapping[str, int]:
302
+ """
303
+ Get the maximum number of tokens from each modality
304
+ for profiling the memory usage of a model.
305
+
306
+ See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
307
+
308
+ Note:
309
+ This should be called after :meth:`init_mm_limits_per_prompt`.
310
+ """
311
+ mm_limits = self.get_mm_limits_per_prompt(model_config)
312
+
313
+ return {
314
+ key: mm_limits[key] * max_tokens_per_mm_item
315
+ for key, max_tokens_per_mm_item in
316
+ self.get_max_tokens_per_item_by_modality(model_config).items()
317
+ }
318
+
319
+ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
320
+ """
321
+ Get the maximum number of multi-modal tokens
322
+ for profiling the memory usage of a model.
323
+
324
+ See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
325
+
326
+ Note:
327
+ This should be called after :meth:`init_mm_limits_per_prompt`.
328
+ """
329
+ return sum(self.get_max_tokens_by_modality(model_config).values())
330
+
331
+ def init_mm_limits_per_prompt(
332
+ self,
333
+ model_config: "ModelConfig",
334
+ ) -> None:
335
+ """
336
+ Initialize the maximum number of multi-modal input instances for each
337
+ modality that are allowed per prompt for a model class.
338
+ """
339
+ if model_config in self._limits_by_model:
340
+ logger.warning(
341
+ "`mm_limits` has already been set for model=%s, and will "
342
+ "be overwritten by the new values.", model_config.model)
343
+
344
+ multimodal_config = model_config.multimodal_config
345
+ if multimodal_config is None:
346
+ limits_per_plugin = self._disabled_limits_per_plugin
347
+ else:
348
+ config_limits_per_plugin = multimodal_config.limit_per_prompt
349
+
350
+ extra_keys = config_limits_per_plugin.keys() - self._plugins.keys()
351
+ if extra_keys:
352
+ logger.warning(
353
+ "Detected extra keys in `--limit-mm-per-prompt` which "
354
+ "are not registered as multi-modal plugins: %s. "
355
+ "They will be ignored.", extra_keys)
356
+
357
+ # NOTE: Currently the default is set to 1 for each plugin
358
+ # TODO: Automatically determine the limits based on budget
359
+ # once more models support multi-image inputs
360
+ limits_per_plugin = {
361
+ key: config_limits_per_plugin.get(key, 1)
362
+ for key in self._plugins
363
+ }
364
+
365
+ self._limits_by_model[model_config] = limits_per_plugin
366
+
367
+ def get_mm_limits_per_prompt(
368
+ self,
369
+ model_config: "ModelConfig",
370
+ ) -> Mapping[str, int]:
371
+ """
372
+ Get the maximum number of multi-modal input instances for each modality
373
+ that are allowed per prompt for a model class.
374
+
375
+ Note:
376
+ This should be called after :meth:`init_mm_limits_per_prompt`.
377
+ """
378
+ if self.has_processor(model_config):
379
+ tokenizer = cached_get_tokenizer(
380
+ model_config.tokenizer,
381
+ trust_remote_code=model_config.trust_remote_code,
382
+ )
383
+ processor = self.create_processor(model_config, tokenizer)
384
+ profiler = MultiModalProfiler(processor)
385
+ return profiler.get_mm_limits()
386
+
387
+ return self._limits_by_model[model_config]
388
+
389
+ def register_processor(
390
+ self,
391
+ processor: MultiModalProcessorFactory[_I],
392
+ *,
393
+ info: ProcessingInfoFactory[_I],
394
+ dummy_inputs: DummyInputsBuilderFactory[_I],
395
+ ):
396
+ """
397
+ Register a multi-modal processor to a model class. The processor
398
+ is constructed lazily, hence a factory method should be passed.
399
+
400
+ When the model receives multi-modal data, the provided function is
401
+ invoked to transform the data into a dictionary of model inputs.
402
+
403
+ See also:
404
+ :ref:`mm-processing`
405
+ """
406
+
407
+ def wrapper(model_cls: N) -> N:
408
+ if self._processor_factories.contains(model_cls, strict=True):
409
+ logger.warning(
410
+ "Model class %s already has a multi-modal processor "
411
+ "registered to %s. It is overwritten by the new one.",
412
+ model_cls, self)
413
+
414
+ self._processor_factories[model_cls] = _ProcessorFactories(
415
+ info=info,
416
+ dummy_inputs=dummy_inputs,
417
+ processor=processor,
418
+ )
419
+
420
+ return model_cls
421
+
422
+ return wrapper
423
+
424
+ def _get_model_cls(self, model_config: "ModelConfig"):
425
+ # Avoid circular import
426
+ from vllm.model_executor.model_loader import get_model_architecture
427
+
428
+ model_cls, _ = get_model_architecture(model_config)
429
+ return model_cls
430
+
431
+ def has_processor(self, model_config: "ModelConfig") -> bool:
432
+ """
433
+ Test whether a multi-modal processor is defined for a specific model.
434
+
435
+ See also:
436
+ :ref:`mm-processing`
437
+ """
438
+ return self._get_model_cls(model_config) in self._processor_factories
439
+
440
+ def create_processor(
441
+ self,
442
+ model_config: "ModelConfig",
443
+ tokenizer: AnyTokenizer,
444
+ ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
445
+ """
446
+ Create a multi-modal processor for a specific model and tokenizer.
447
+
448
+ See also:
449
+ :ref:`mm-processing`
450
+ """
451
+ model_cls = self._get_model_cls(model_config)
452
+ factories = self._processor_factories[model_cls]
453
+
454
+ ctx = InputProcessingContext(model_config, tokenizer)
455
+ cache = (None if model_config.disable_mm_preprocessor_cache else
456
+ self._processing_cache)
457
+
458
+ return factories.build_processor(ctx, cache=cache)
.venv/lib/python3.11/site-packages/vllm/multimodal/utils.py ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from functools import lru_cache
4
+ from itertools import groupby
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Optional, TypeVar, Union
7
+ from urllib.parse import ParseResult, urlparse
8
+
9
+ import numpy as np
10
+ import numpy.typing as npt
11
+ from PIL import Image
12
+
13
+ import vllm.envs as envs
14
+ from vllm.connections import HTTPConnection, global_http_connection
15
+ from vllm.logger import init_logger
16
+ from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
17
+
18
+ from .audio import AudioMediaIO
19
+ from .base import MediaIO
20
+ from .image import ImageMediaIO
21
+ from .inputs import PlaceholderRange
22
+ from .video import VideoMediaIO
23
+
24
+ logger = init_logger(__name__)
25
+
26
+ cached_get_tokenizer = lru_cache(get_tokenizer)
27
+
28
+ _M = TypeVar("_M")
29
+
30
+ if TYPE_CHECKING:
31
+ from .hasher import MultiModalHashDict
32
+ from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
33
+
34
+
35
class MediaConnector:
    """Resolves HTTP(S), ``data:`` and ``file:`` URLs into decoded media.

    The actual decoding is delegated to the :class:`MediaIO` instance passed
    by the caller, so a single connector serves audio, image and video URLs.
    """

    def __init__(
        self,
        connection: HTTPConnection = global_http_connection,
        *,
        allowed_local_media_path: str = "",
    ) -> None:
        """
        Args:
            connection: HTTP client used to fetch http(s) URLs.
            allowed_local_media_path: If non-empty, the only directory from
                which ``file:`` URLs may be served. Must exist and be a
                directory; otherwise a :exc:`ValueError` is raised.
        """
        super().__init__()

        self.connection = connection

        if allowed_local_media_path:
            media_root = Path(allowed_local_media_path)

            if not media_root.exists():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{media_root} does not exist.")
            if not media_root.is_dir():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{media_root} must be a directory.")

            self.allowed_local_media_path: Optional[Path] = media_root
        else:
            self.allowed_local_media_path = None

    def _load_data_url(
        self,
        url_spec: ParseResult,
        media_io: MediaIO[_M],
    ) -> _M:
        # A data URL path looks like "<media-type>;base64,<payload>".
        header, payload = url_spec.path.split(",", 1)
        media_type, encoding = header.split(";", 1)

        if encoding != "base64":
            raise NotImplementedError(
                "Only base64 data URLs are supported for now.")

        return media_io.load_base64(media_type, payload)

    def _load_file_url(
        self,
        url_spec: ParseResult,
        media_io: MediaIO[_M],
    ) -> _M:
        media_root = self.allowed_local_media_path
        if media_root is None:
            raise RuntimeError("Cannot load local files without "
                               "`--allowed-local-media-path`.")

        filepath = Path(url_spec.path)
        # Resolve symlinks/".." first so paths cannot escape the allowed root.
        if media_root not in filepath.resolve().parents:
            raise ValueError(
                f"The file path {filepath} must be a subpath "
                f"of `--allowed-local-media-path` {media_root}.")

        return media_io.load_file(filepath)

    def load_from_url(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: Optional[int] = None,
    ) -> _M:
        """Synchronously fetch and decode the media behind ``url``."""
        url_spec = urlparse(url)
        scheme = url_spec.scheme

        if scheme.startswith("http"):
            raw = self.connection.get_bytes(url, timeout=fetch_timeout)
            return media_io.load_bytes(raw)

        if scheme == "data":
            return self._load_data_url(url_spec, media_io)

        if scheme == "file":
            return self._load_file_url(url_spec, media_io)

        raise ValueError("The URL must be either a HTTP, data or file URL.")

    async def load_from_url_async(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: Optional[int] = None,
    ) -> _M:
        """Asynchronously fetch and decode the media behind ``url``."""
        url_spec = urlparse(url)
        scheme = url_spec.scheme

        if scheme.startswith("http"):
            raw = await self.connection.async_get_bytes(url,
                                                        timeout=fetch_timeout)
            return media_io.load_bytes(raw)

        if scheme == "data":
            return self._load_data_url(url_spec, media_io)

        if scheme == "file":
            return self._load_file_url(url_spec, media_io)

        raise ValueError("The URL must be either a HTTP, data or file URL.")

    def fetch_audio(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, Union[int, float]]:
        """
        Load audio from a URL.
        """
        return self.load_from_url(
            audio_url,
            AudioMediaIO(),
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    async def fetch_audio_async(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, Union[int, float]]:
        """
        Asynchronously fetch audio from a URL.
        """
        return await self.load_from_url_async(
            audio_url,
            AudioMediaIO(),
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    def fetch_image(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Load a PIL image from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        """
        return self.load_from_url(
            image_url,
            ImageMediaIO(image_mode=image_mode),
            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
        )

    async def fetch_image_async(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Asynchronously load a PIL image from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        """
        return await self.load_from_url_async(
            image_url,
            ImageMediaIO(image_mode=image_mode),
            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
        )

    def fetch_video(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
        num_frames: int = 32,
    ) -> npt.NDArray:
        """
        Load video from a HTTP or base64 data URL.
        """
        frame_io = ImageMediaIO(image_mode=image_mode)
        return self.load_from_url(
            video_url,
            VideoMediaIO(frame_io, num_frames=num_frames),
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    async def fetch_video_async(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
        num_frames: int = 32,
    ) -> npt.NDArray:
        """
        Asynchronously load video from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        """
        frame_io = ImageMediaIO(image_mode=image_mode)
        return await self.load_from_url_async(
            video_url,
            VideoMediaIO(frame_io, num_frames=num_frames),
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )
250
+
251
+
252
global_media_connector = MediaConnector()
"""The global :class:`MediaConnector` instance used by vLLM."""

# Module-level convenience aliases bound to the global connector's methods.
fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image
fetch_video = global_media_connector.fetch_video
258
+
259
+
260
def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
) -> str:
    """Encode an (audio, sampling_rate) pair as base64."""
    return AudioMediaIO().encode_base64((audio, sampling_rate))
267
+
268
+
269
def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "JPEG",
) -> str:
    """
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.
    """
    io_helper = ImageMediaIO(image_mode=image_mode)
    return io_helper.encode_base64(image, image_format=format)
282
+
283
+
284
def encode_video_base64(frames: npt.NDArray) -> str:
    """Encode a stack of video frames as base64 via :class:`VideoMediaIO`."""
    return VideoMediaIO(ImageMediaIO()).encode_base64(frames)
288
+
289
+
290
# Utilities for input processors
_T = TypeVar("_T", str, int)


def repeat_and_pad_token(
    token: _T,
    *,
    repeat_count: int = 1,
    pad_token_left: Optional[_T] = None,
    pad_token_right: Optional[_T] = None,
) -> list[_T]:
    """Repeat ``token`` ``repeat_count`` times, with optional pad tokens
    prepended/appended on either side."""
    out: list[_T] = []
    if pad_token_left is not None:
        out.append(pad_token_left)
    out.extend(token for _ in range(repeat_count))
    if pad_token_right is not None:
        out.append(pad_token_right)
    return out
308
+
309
+
310
def repeat_and_pad_placeholder_tokens(
    tokenizer: AnyTokenizer,
    prompt: Optional[str],
    prompt_token_ids: list[int],
    *,
    placeholder_token_id: int,
    repeat_count: Union[int, list[int]],
    pad_token_left: Optional[int] = None,
    pad_token_right: Optional[int] = None,
) -> tuple[Optional[str], list[int], list[PlaceholderRange]]:
    """Expand each occurrence of ``placeholder_token_id`` into a repeated
    (and optionally padded) run, in both the text prompt and the token ids.

    The i-th placeholder occurrence is repeated ``repeat_count[i]`` times
    (a scalar ``repeat_count`` applies to a single occurrence). Returns the
    rewritten prompt (or ``None`` if ``prompt`` is ``None``), the rewritten
    token ids, and a ``PlaceholderRange`` per expanded occurrence; each
    range's length excludes the pad tokens.
    """
    if isinstance(repeat_count, int):
        repeat_count = [repeat_count]

    if prompt is None:
        new_prompt = None
    else:
        placeholder_token_str = tokenizer.decode(placeholder_token_id)
        pad_token_str_left = (None if pad_token_left is None else
                              tokenizer.decode(pad_token_left))
        pad_token_str_right = (None if pad_token_right is None else
                               tokenizer.decode(pad_token_right))

        placeholder_token_count = prompt.count(placeholder_token_str)
        # This is an arbitrary number to distinguish between the two cases
        if placeholder_token_count > 16:
            logger.warning(
                "Please follow the prompt format that is "
                "documented on HuggingFace which does not involve "
                "repeating %s tokens.", placeholder_token_str)
        if placeholder_token_count < len(repeat_count):
            logger.warning(
                "The number of multi-modal placeholder tokens in the prompt "
                "is less than the number of multi-modal inputs. Extra "
                "placeholder tokens will be treated as plain text")
            # Drop the repeat counts that have no matching placeholder.
            repeat_count = repeat_count[:placeholder_token_count]

        # maxsplit leaves any extra placeholders untouched in the tail part.
        prompt_parts = prompt.split(placeholder_token_str,
                                    maxsplit=len(repeat_count))
        new_prompt = ""
        for i, repeat_count_item in enumerate(repeat_count):
            replacement_str = "".join(
                repeat_and_pad_token(
                    placeholder_token_str,
                    repeat_count=repeat_count_item,
                    pad_token_left=pad_token_str_left,
                    pad_token_right=pad_token_str_right,
                ))
            # The image tokens are removed to be consistent with HuggingFace
            new_prompt += prompt_parts[i] + replacement_str
        new_prompt += prompt_parts[-1]

    # Second pass: mirror the same expansion over the token-id sequence,
    # recording the offset/length of each expanded run.
    new_token_ids = list[int]()
    placeholder_ranges = list[PlaceholderRange]()
    placeholder_token_idx = 0
    for i, token in enumerate(prompt_token_ids):
        if token == placeholder_token_id:
            curr_repeat_count = repeat_count[placeholder_token_idx]
            replacement_ids = repeat_and_pad_token(
                placeholder_token_id,
                repeat_count=curr_repeat_count,
                pad_token_left=pad_token_left,
                pad_token_right=pad_token_right,
            )
            offset = len(new_token_ids)
            # The range covers only the repeated tokens, not the left pad.
            if pad_token_left is not None:
                offset += 1
            placeholder_ranges.append({
                "offset": offset,
                "length": curr_repeat_count,
            })
            new_token_ids.extend(replacement_ids)
            placeholder_token_idx += 1

            # No need to further scan the list since we replaced all tokens
            if placeholder_token_idx >= len(repeat_count):
                new_token_ids.extend(prompt_token_ids[i + 1:])
                break
        else:
            new_token_ids.append(token)

    return new_prompt, new_token_ids, placeholder_ranges
391
+
392
+
393
def consecutive_placeholder_ranges(
        num_items: int,
        item_size: int,
        initial_offset: int = 0) -> list[PlaceholderRange]:
    """Returns a list of consecutive PlaceholderRanges of a fixed size"""

    starts = range(initial_offset,
                   initial_offset + num_items * item_size,
                   item_size)
    return [
        PlaceholderRange(offset=start, length=item_size) for start in starts
    ]
403
+
404
+
405
def merge_and_sort_multimodal_metadata(
    mm_positions: "MultiModalPlaceholderDict",
    mm_hashes: Optional["MultiModalHashDict"],
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
    """Merge per-modality placeholder ranges into one offset-sorted list.

    Modalities are ordered by the offset of their first placeholder; the
    per-modality range lists are then concatenated in that order. When
    ``mm_hashes`` is provided, its per-modality hash lists are reordered and
    concatenated the same way.

    Raises:
        ValueError: If placeholders of different modalities interleave in the
            input sequence (e.g. "<image><audio><image> Describe the
            content.").

    Returns:
        list[str]: Modalities sorted by first-placeholder offset.
        list[PlaceholderRange]: All ranges from ``mm_positions``, merged.
        Optional[list[str]]: Merged hashes if ``mm_hashes`` is given,
        ``None`` otherwise.
    """

    modalities = list(mm_positions.keys())

    assert len(modalities) > 0, "No modalities found in the mm_positions."

    # Single modality: per-modality lists are already in order.
    if len(modalities) == 1:
        only = modalities[0]
        merged_hashes = None if mm_hashes is None else list(mm_hashes[only])
        return modalities, list(mm_positions[only]), merged_hashes

    tagged_placeholders = [(modality, mm_positions[modality])
                           for modality in modalities]

    if mm_hashes is None:
        ordered = sorted(tagged_placeholders,
                         key=lambda entry: entry[1][0]['offset'])
        ordered_hashes = None
    else:
        # Keep hashes paired with their modality while sorting by offset.
        hash_groups = [
            mm_hashes[modality] for modality in modalities
            if modality in mm_hashes
        ]
        paired = sorted(zip(tagged_placeholders, hash_groups),
                        key=lambda pair: pair[0][1][0]['offset'])
        placeholder_part, hash_part = zip(*paired)
        ordered = list(placeholder_part)
        ordered_hashes = list(hash_part)

    sorted_modalities = [modality for modality, _ in ordered]

    # Concatenate and reject any cross-modality interleaving: every range of
    # a later modality must start at or after the previous modality's last.
    merged_placeholders: list[PlaceholderRange] = []
    for _, ranges in ordered:
        if merged_placeholders and ranges[0][
                'offset'] < merged_placeholders[-1]['offset']:
            raise ValueError(
                "Interleaved mixed-modality inference is currently not "
                "supported.")
        merged_placeholders.extend(ranges)

    if ordered_hashes is None:
        merged_hashes = None
    else:
        merged_hashes = []
        for group in ordered_hashes:
            merged_hashes.extend(group)

    return sorted_modalities, merged_placeholders, merged_hashes
483
+
484
+
485
def group_mm_inputs_by_modality(
        mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
    """Group consecutive MultiModalKwargs from mm_inputs with the same modality
    together into the same list for batching purpose. For MultiModalKwargs with
    multiple modalities, put them into their own list.

    Args:
        mm_inputs: List of MultiModalKwargs.

    Returns:
        list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each
        inner list contains consecutive MultiModalKwargs with same modality, or
        one with multimodal modalities.
    """
    if not mm_inputs:
        return []

    def batching_key(entry: "MultiModalKwargs") -> Union[str, int]:
        modalities = entry.modalities
        if len(modalities) > 1:
            # Mixed-modality inputs get a unique key so they are never
            # batched with anything else.
            return id(entry)
        if len(modalities) == 1:
            return next(iter(modalities))
        # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty,
        # this is used to make InternVL with legacy pipeline still work with v1.
        return ""

    return [list(run) for _, run in groupby(mm_inputs, key=batching_key)]
.venv/lib/python3.11/site-packages/vllm/multimodal/video.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import base64
4
+ from functools import lru_cache, partial
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Dict, Optional
8
+
9
+ import numpy as np
10
+ import numpy.typing as npt
11
+ from PIL import Image
12
+
13
+ from vllm.inputs.registry import InputContext
14
+ from vllm.logger import init_logger
15
+ from vllm.transformers_utils.processor import get_video_processor
16
+ from vllm.transformers_utils.tokenizer import get_tokenizer
17
+ from vllm.utils import PlaceholderModule, is_list_of
18
+
19
+ from .base import MediaIO, ModalityData
20
+ from .image import ImageMediaIO, ImagePlugin
21
+ from .inputs import MultiModalKwargs, VideoItem
22
+
23
+ if TYPE_CHECKING:
24
+ from vllm.config import ModelConfig
25
+
26
+ try:
27
+ import decord
28
+ except ImportError:
29
+ decord = PlaceholderModule("decord") # type: ignore[assignment]
30
+
31
+ logger = init_logger(__name__)
32
+
33
+ cached_get_video_processor = lru_cache(get_video_processor)
34
+ cached_get_tokenizer = lru_cache(get_tokenizer)
35
+
36
+
37
class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        return "video"

    def _get_hf_video_processor(
        self,
        model_config: "ModelConfig",
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # Fetch (and cache) the HF video processor for this model.
        kwargs = {} if mm_processor_kwargs is None else mm_processor_kwargs
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: ModalityData[VideoItem],
        **mm_processor_kwargs,
    ) -> MultiModalKwargs:
        model_config = ctx.model_config

        # Unwrap a single-item list to the bare array.
        if isinstance(data, list) and len(data) == 1:
            data = data[0]  # type: ignore

        if not (isinstance(data, np.ndarray) or is_list_of(data, np.ndarray)):
            raise TypeError(f"Invalid video type: {type(data)}")

        video_processor = self._get_hf_video_processor(
            model_config,
            mm_processor_kwargs,
        )
        if video_processor is None:
            raise RuntimeError("No HuggingFace processor is available "
                               "to process the video object")
        try:
            # NOTE: Similar to image; it may be a good idea to filter and
            # pass mm_processor_kwargs here too, but for now we don't to
            # avoid extra complexity if the initializer and preprocess
            # signatures of the processor don't align
            batch_data = video_processor(data, return_tensors="pt").data
        except Exception:
            logger.error("Failed to process video (%s)", data)
            raise

        return MultiModalKwargs(batch_data)

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        return 4096
90
+
91
+
92
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a (num_frames, H, W, C) array to ``size``
    (given as ``(height, width)``) using OpenCV."""
    # lazy import cv2 to avoid bothering users who only use text models
    import cv2

    target_h, target_w = size
    num_frames = frames.shape[0]
    channels = frames.shape[3]

    out = np.empty((num_frames, target_h, target_w, channels),
                   dtype=frames.dtype)
    for idx in range(num_frames):
        # cv2.resize takes (width, height) order.
        out[idx] = cv2.resize(frames[idx], (target_w, target_h))
    return out
103
+
104
+
105
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale the height and width of every frame by ``size_factor``
    (truncated to integers)."""
    _, height, width, _ = frames.shape
    scaled = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, scaled)
111
+
112
+
113
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """Uniformly sample ``num_frames`` frames (by linearly spaced indices);
    ``num_frames == -1`` returns the input unchanged."""
    if num_frames == -1:
        return frames

    last = frames.shape[0] - 1
    indices = np.linspace(0, last, num_frames, dtype=int)
    return frames[indices, ...]
122
+
123
+
124
class VideoMediaIO(MediaIO[npt.NDArray]):
    """Decodes/encodes videos as (num_frames, H, W, C) numpy arrays,
    delegating per-frame work to an :class:`ImageMediaIO`."""

    def __init__(
        self,
        image_io: ImageMediaIO,
        *,
        num_frames: int = 32,
    ) -> None:
        """
        Args:
            image_io: Helper used to decode/encode individual frames.
            num_frames: Maximum number of frames sampled per video.
        """
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

    def load_bytes(self, data: bytes) -> npt.NDArray:
        reader = decord.VideoReader(BytesIO(data), num_threads=1)
        available = len(reader)
        limit = self.num_frames

        # Sample at most `limit` uniformly spaced frames.
        if available <= limit:
            indices = list(range(available))
        else:
            indices = np.linspace(0, available - 1, limit,
                                  dtype=int).tolist()

        return reader.get_batch(indices).asnumpy()

    def load_base64(self, media_type: str, data: str) -> npt.NDArray:
        # "video/jpeg" is a comma-separated list of base64 JPEG frames.
        if media_type.lower() == "video/jpeg":
            decode_frame = partial(
                self.image_io.load_base64,
                "image/jpeg",
            )
            frames = [
                np.array(decode_frame(chunk)) for chunk in data.split(",")
            ]
            return np.stack(frames)

        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> npt.NDArray:
        with filepath.open("rb") as f:
            raw = f.read()

        return self.load_bytes(raw)

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        if video_format != "JPEG":
            raise NotImplementedError(
                "Only JPEG format is supported for now.")

        encode_frame = partial(
            self.image_io.encode_base64,
            image_format=video_format,
        )
        return ",".join(
            encode_frame(Image.fromarray(frame)) for frame in media)
.venv/lib/python3.11/site-packages/vllm/triton_utils/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
from vllm.triton_utils.importing import HAS_TRITON

__all__ = ["HAS_TRITON"]

# Only import (and export) the cache-manager helper when Triton is actually
# available, so importing this package stays safe on Triton-less installs.
if HAS_TRITON:

    from vllm.triton_utils.custom_cache_manager import (
        maybe_set_triton_cache_manager)

    __all__ += ["maybe_set_triton_cache_manager"]
.venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (473 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-311.pyc ADDED
Binary file (3.53 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/triton_utils/__pycache__/importing.cpython-311.pyc ADDED
Binary file (779 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cache_engine.cpython-311.pyc ADDED
Binary file (7.42 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_model_runner.cpython-311.pyc ADDED
Binary file (33.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_pooling_model_runner.cpython-311.pyc ADDED
Binary file (6.72 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/cpu_worker.cpython-311.pyc ADDED
Binary file (20.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/enc_dec_model_runner.cpython-311.pyc ADDED
Binary file (22.7 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/hpu_worker.cpython-311.pyc ADDED
Binary file (27.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/model_runner.cpython-311.pyc ADDED
Binary file (87.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/model_runner_base.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_model_runner.cpython-311.pyc ADDED
Binary file (36.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-311.pyc ADDED
Binary file (4.61 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/multi_step_worker.cpython-311.pyc ADDED
Binary file (7.57 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/neuron_model_runner.cpython-311.pyc ADDED
Binary file (16 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/neuron_worker.cpython-311.pyc ADDED
Binary file (6.76 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/openvino_model_runner.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/openvino_worker.cpython-311.pyc ADDED
Binary file (27 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/pooling_model_runner.cpython-311.pyc ADDED
Binary file (9.95 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/tpu_model_runner.cpython-311.pyc ADDED
Binary file (40.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/tpu_worker.cpython-311.pyc ADDED
Binary file (15.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/utils.cpython-311.pyc ADDED
Binary file (2.26 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/worker.cpython-311.pyc ADDED
Binary file (29.1 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/worker_base.cpython-311.pyc ADDED
Binary file (30.1 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/xpu_model_runner.cpython-311.pyc ADDED
Binary file (26.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/worker/__pycache__/xpu_worker.cpython-311.pyc ADDED
Binary file (9.11 kB). View file