# SPDX-License-Identifier: Apache-2.0

# Package bootstrap: when vLLM is configured to use ModelScope, redirect hub
# downloads to ModelScope before any other submodule gets imported.
from vllm.envs import VLLM_USE_MODELSCOPE

if VLLM_USE_MODELSCOPE:
    # Imported lazily so that modelscope stays an optional dependency.
    import modelscope
    from packaging import version

    # `patch_hub` only exists from modelscope 1.18.1 onwards, so anything that
    # does not compare strictly greater than 1.18.0 is rejected.
    if not (version.parse(modelscope.__version__) > version.parse('1.18.0')):
        raise ImportError(
            'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
            'install by `pip install modelscope -U`')

    from modelscope.utils.hf_util import patch_hub

    # Route hub downloads through ModelScope.
    patch_hub()
b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0872536ef255f85f6e5c7f03de946d040b7b6e33 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..614069a4ec17e0f3db18bc40bfbb38e94a2e54bf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbe3080eb1571a9baff66e0399e3ceea8142b1b9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c84d02a7b6a0127a9e16758902b0a11c25a1c2e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7b83d275c84e1d39e43a8cc5e670f411a0e3debe Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..019a29b8cd23abcaabbcfa5ea3638a881ec55b75 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..1c0f20a6e045b23e52ebdc55705db7e4c79579cb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py @@ -0,0 +1,605 @@ +# SPDX-License-Identifier: Apache-2.0 + +import enum +import json +import os +from pathlib import Path +from typing import Any, Dict, Optional, Type, Union + +import huggingface_hub +from huggingface_hub import (file_exists, hf_hub_download, + try_to_load_from_cache) +from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError) +from torch import nn +from transformers import GenerationConfig, PretrainedConfig +from transformers.models.auto.image_processing_auto import ( + get_image_processor_config) +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME + +from vllm.envs import VLLM_USE_MODELSCOPE +from vllm.logger import init_logger +# yapf conflicts with isort for this block +# yapf: disable +from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, + 
class ConfigFormat(str, enum.Enum):
    """Layouts in which a model's configuration may be stored."""
    # Let get_config() decide between HF and MISTRAL by probing for files.
    AUTO = "auto"
    # Hugging Face layout (the file named by HF_CONFIG_NAME).
    HF = "hf"
    # Mistral layout (the file named by MISTRAL_CONFIG_NAME).
    MISTRAL = "mistral"


def file_or_path_exists(model: Union[str, Path], config_name: str,
                        revision: Optional[str]) -> bool:
    """Report whether ``config_name`` is available for ``model``.

    Args:
        model: A local directory or a Hugging Face Hub repo id.
        config_name: File name to look for.
        revision: Hub revision to check; ignored for local directories.

    Returns:
        ``True`` if the file exists locally, is present in the local hub
        cache, or exists on the Hub. ``False`` otherwise, including when
        offline mode prevents the Hub from being queried.
    """
    local_dir = Path(model)
    if local_dir.exists():
        return (local_dir / config_name).is_file()

    # Offline mode support: a cached copy is enough to keep going.
    cached_filepath = try_to_load_from_cache(repo_id=model,
                                             filename=config_name,
                                             revision=revision)
    if isinstance(cached_filepath, str):
        return True

    # NB: file_exists only checks the Hub, so it fails in offline mode.
    try:
        return file_exists(model,
                           config_name,
                           revision=revision,
                           token=HF_TOKEN)
    except huggingface_hub.errors.OfflineModeIsEnabled:
        # Don't raise in offline mode: all we know is that the file is not
        # cached.
        return False


def patch_rope_scaling(config: PretrainedConfig) -> None:
    """Provide backwards compatibility for RoPE.

    Normalizes ``config.rope_scaling`` in place, recursing into
    ``config.text_config`` first when the config has one.
    """
    nested_text_config = getattr(config, "text_config", None)
    if nested_text_config is not None:
        patch_rope_scaling(nested_text_config)

    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is not None:
        patch_rope_scaling_dict(rope_scaling)


def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None:
    """Normalize a ``rope_scaling`` dictionary in place.

    * A legacy ``type`` key is copied to ``rope_type`` when the latter is
      missing.
    * ``rope_type == "su"`` is rewritten to ``"longrope"``.
    * ``rope_type == "mrope"`` is rewritten to ``"default"``.

    Raises:
        ValueError: if ``type`` and ``rope_type`` disagree, if neither key is
            present, or if ``rope_type`` is ``"mrope"`` but ``mrope_section``
            is missing.
    """
    has_modern_key = "rope_type" in rope_scaling
    has_legacy_key = "type" in rope_scaling

    if has_modern_key and has_legacy_key:
        rope_type = rope_scaling["rope_type"]
        rope_type_legacy = rope_scaling["type"]
        if rope_type != rope_type_legacy:
            raise ValueError(
                f"Found conflicts between 'rope_type={rope_type}' (modern "
                f"field) and 'type={rope_type_legacy}' (legacy field). "
                "You should only specify one of them.")
    elif has_legacy_key:
        rope_scaling["rope_type"] = rope_scaling["type"]
        logger.info("Replacing legacy 'type' key with 'rope_type'")
    elif not has_modern_key:
        raise ValueError("rope_scaling should have a 'rope_type' key")

    rope_type = rope_scaling["rope_type"]
    if rope_type == "su":
        rope_scaling["rope_type"] = "longrope"
        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
    elif rope_type == "mrope":
        # Raised explicitly rather than asserted: `assert` is removed under
        # `python -O`, and every other validation error here is a ValueError.
        if "mrope_section" not in rope_scaling:
            raise ValueError(
                "rope_scaling with rope_type 'mrope' should have a "
                "'mrope_section' key")
        rope_scaling["rope_type"] = "default"
        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")


def uses_mrope(config: PretrainedConfig) -> bool:
    """Detect if the model with this config uses M-ROPE.

    True exactly when ``config.rope_scaling`` is set and contains an
    ``mrope_section`` entry.
    """
    rope_scaling = getattr(config, "rope_scaling", None)
    return rope_scaling is not None and "mrope_section" in rope_scaling


def is_encoder_decoder(config: PretrainedConfig) -> bool:
    """Detect if the model with this config is used as an encoder/decoder.

    When the config carries a ``text_config``, the answer is taken from that
    nested config instead of the outer one.
    """
    nested_text_config = getattr(config, "text_config", None)
    if nested_text_config is not None:
        return is_encoder_decoder(nested_text_config)

    return getattr(config, "is_encoder_decoder", False)
format, then + # raise an offline mode error to indicate to the user that they + # don't have files cached and may need to go online. + # This is conveniently triggered by calling file_exists(). + file_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=HF_TOKEN) + + raise ValueError(f"No supported config format found in {model}") + + if config_format == ConfigFormat.HF: + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + code_revision=code_revision, + token=HF_TOKEN, + **kwargs, + ) + + # Use custom model class if it's in our registry + model_type = config_dict.get("model_type") + if model_type in _CONFIG_REGISTRY: + config_class = _CONFIG_REGISTRY[model_type] + config = config_class.from_pretrained( + model, + revision=revision, + code_revision=code_revision, + token=HF_TOKEN, + **kwargs, + ) + else: + try: + config = AutoConfig.from_pretrained( + model, + trust_remote_code=trust_remote_code, + revision=revision, + code_revision=code_revision, + token=HF_TOKEN, + **kwargs, + ) + except ValueError as e: + if (not trust_remote_code + and "requires you to execute the configuration file" + in str(e)): + err_msg = ( + "Failed to load the model config. 
def get_hf_file_to_dict(file_name: str,
                        model: Union[str, Path],
                        revision: Optional[str] = 'main'):
    """
    Load a JSON file belonging to a model and return its parsed contents.

    The file is read from the local model directory when present there;
    otherwise it is fetched with ``hf_hub_download``.

    Parameters:
    - file_name (str): The name of the file to load.
    - model (str): A local model directory or a Hugging Face Hub repo id.
    - revision (str): The specific version of the model.

    Returns:
    - The parsed JSON content, or ``None`` when the file does not exist,
      cannot be found on the Hub, or the Hub cannot be reached.
    """
    if not file_or_path_exists(
            model=model, config_name=file_name, revision=revision):
        return None

    file_path = Path(model) / file_name
    if not file_path.is_file():
        try:
            hf_hub_file = hf_hub_download(model, file_name, revision=revision)
        except (RepositoryNotFoundError, RevisionNotFoundError,
                EntryNotFoundError, LocalEntryNotFoundError) as e:
            # The exception needs a %s placeholder: without one, logging
            # reports a formatting error instead of emitting the message.
            logger.debug(
                "File or repository not found in hf_hub_download: %s", e)
            return None
        except HfHubHTTPError as e:
            logger.warning(
                "Cannot connect to Hugging Face Hub. Skipping file "
                "download for '%s':",
                file_name,
                exc_info=e)
            return None
        file_path = Path(hf_hub_file)

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(file_path, encoding="utf-8") as file:
        return json.load(file)


def get_pooling_config(model: str, revision: Optional[str] = 'main'):
    """
    This function gets the pooling and normalize
    config from the model - only applies to
    sentence-transformers models.

    Args:
        model (str): The name of the Hugging Face model.
        revision (str, optional): The specific version
        of the model to use. Defaults to 'main'.

    Returns:
        dict: A dictionary with keys ``pooling_type`` and ``normalize``, or
        ``None`` when ``modules.json`` is unavailable, lists no Pooling
        module, or the Pooling module's own config cannot be loaded.
    """
    modules_dict = get_hf_file_to_dict("modules.json", model, revision)
    if modules_dict is None:
        return None

    pooling = next((item for item in modules_dict
                    if item["type"] == "sentence_transformers.models.Pooling"),
                   None)
    normalize = bool(
        next((item for item in modules_dict
              if item["type"] == "sentence_transformers.models.Normalize"),
             False))

    if not pooling:
        return None

    pooling_file_name = "{}/config.json".format(pooling["path"])
    pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
    if pooling_dict is None:
        # The helper returns None for a missing or unreachable file; without
        # this guard the `.items()` call below raises AttributeError.
        return None

    # The first flag set to exactly True names the active pooling mode.
    pooling_type_name = next(
        (item for item, val in pooling_dict.items() if val is True), None)
    if pooling_type_name is not None:
        pooling_type_name = get_pooling_config_name(pooling_type_name)

    return {"pooling_type": pooling_type_name, "normalize": normalize}


def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
    """Map a pooling flag name to a supported pooling type name.

    The ``pooling_mode_`` marker is removed, only the text before the first
    remaining underscore is kept, and any name containing ``lasttoken``
    becomes ``last``. The result is upper-cased.

    Returns:
        One of ``'LAST'``, ``'ALL'``, ``'CLS'``, ``'STEP'``, ``'MEAN'``, or
        ``None`` when the name does not map to a supported type.
    """
    supported_pooling_types = ('LAST', 'ALL', 'CLS', 'STEP', 'MEAN')

    pooling_name = pooling_name.replace("pooling_mode_", "").split("_")[0]
    if "lasttoken" in pooling_name:
        pooling_name = "last"

    pooling_type_name = pooling_name.upper()
    if pooling_type_name in supported_pooling_types:
        return pooling_type_name
    return None


def get_sentence_transformer_tokenizer_config(model: str,
                                              revision: Optional[str] = 'main'
                                              ):
    """
    Returns the tokenization configuration dictionary for a
    given Sentence Transformer BERT model.

    Parameters:
    - model (str): The name of the Sentence Transformer
    BERT model.
    - revision (str, optional): The revision of the
    model to use. Defaults to 'main'.

    Returns:
    - dict: The first non-empty ``sentence_*_config.json`` found for the
    model, provided it defines both ``max_seq_length`` and
    ``do_lower_case``; otherwise ``None``.
    """
    candidate_config_names = (
        "sentence_bert_config.json",
        "sentence_roberta_config.json",
        "sentence_distilbert_config.json",
        "sentence_camembert_config.json",
        "sentence_albert_config.json",
        "sentence_xlm-roberta_config.json",
        "sentence_xlnet_config.json",
    )

    encoder_dict = None
    for config_name in candidate_config_names:
        encoder_dict = get_hf_file_to_dict(config_name, model, revision)
        if encoder_dict:
            break

    if not encoder_dict:
        return None

    required_keys = ("max_seq_length", "do_lower_case")
    if all(key in encoder_dict for key in required_keys):
        return encoder_dict
    return None
+ + Examples: + + >>> from transformers import AutoConfig + >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True) + >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig + >>> import transformers_modules # error, not initialized + >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True) + >>> import transformers_modules # success, initialized + >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config + + In the DeepSeek example, the config class is an instance of a custom + class that is not serializable by default. This class will not be + importable in spawned workers, and won't exist at all on + other nodes, which breaks serialization of the config. + + In this function we tell the cloudpickle serialization library to pass + instances of these generated classes by value instead of by reference, + i.e. the class definition is serialized along with its data so that the + class module does not need to be importable on the receiving end. 
+ + See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs + """ # noqa + try: + import transformers_modules + except ImportError: + # the config does not need trust_remote_code + return + + try: + import cloudpickle + cloudpickle.register_pickle_by_value(transformers_modules) + + # ray vendors its own version of cloudpickle + from vllm.executor.ray_utils import ray + if ray: + ray.cloudpickle.register_pickle_by_value(transformers_modules) + + # multiprocessing uses pickle to serialize arguments when using spawn + # Here we get pickle to use cloudpickle to serialize config objects + # that contain instances of the custom config class to avoid + # serialization problems if the generated module (and model) has a `.` + # in its name + import multiprocessing + import pickle + + from vllm.config import VllmConfig + + def _reduce_config(config: VllmConfig): + return (pickle.loads, (cloudpickle.dumps(config), )) + + multiprocessing.reducer.register(VllmConfig, _reduce_config) + + except Exception as e: + logger.warning( + "Unable to register remote classes used by" + " trust_remote_code with by-value serialization. This may" + " lead to a later error. 
def load_params_config(model: Union[str, Path], revision: Optional[str],
                       **kwargs) -> PretrainedConfig:
    """Build a ``PretrainedConfig`` from a Mistral-format ``params.json``.

    Mistral key names are translated to their Hugging Face equivalents,
    defaults are filled in, and nested dictionaries are converted to nested
    ``PretrainedConfig`` objects.

    Args:
        model: A local model directory or a Hugging Face Hub repo id.
        revision: Hub revision to load from.
        **kwargs: Extra entries merged into the top-level config.

    Raises:
        ValueError: if ``params.json`` cannot be loaded or does not contain a
            JSON object.
    """
    config_file_name = "params.json"

    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
    # Raised explicitly rather than asserted: the helper returns None for a
    # missing file, and `assert` is removed under `python -O`.
    if not isinstance(config_dict, dict):
        raise ValueError(
            f"Could not load a JSON object from '{config_file_name}' "
            f"for model {model}")

    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        # Leaves pass through; dicts become configs with translated keys.
        if not isinstance(elem, dict):
            return elem
        translated = {}
        for key, value in elem.items():
            translated[config_mapping.get(key, key)] = recurse_elems(value)
        return PretrainedConfig(**translated)

    config_dict.setdefault("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict.setdefault("max_seq_len", 128_000)
    config_dict.setdefault("max_position_embeddings", 128_000)

    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    if config_dict.get("vision_encoder") is not None:
        # Multimodal checkpoint: split into text and vision sub-configs.
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    # NOTE(review): get_config() calls this function with token=HF_TOKEN, so
    # the token is merged into the config here like any other kwarg. Confirm
    # that storing it on the config object is intended.
    config_dict.update(kwargs)

    return recurse_elems(config_dict)


def get_hf_image_processor_config(
    model: Union[str, Path],
    revision: Optional[str] = None,
    **kwargs,
) -> Dict[str, Any]:
    """Return the image processor configuration for ``model``.

    Returns an empty dict when ModelScope is in use. For GGUF files the
    lookup is done on the directory containing the file.
    """
    # ModelScope does not provide an interface for image_processor
    if VLLM_USE_MODELSCOPE:
        return dict()
    # Separate model folder from file path for GGUF models
    if check_gguf_file(model):
        model = Path(model).parent
    return get_image_processor_config(model, revision=revision, **kwargs)


def get_hf_text_config(config: PretrainedConfig):
    """Get the "sub" config relevant to llm for multi modal models.
    No op for pure text models.
    """
    if not hasattr(config, "text_config"):
        return config

    # The code operates under the assumption that text_config should have
    # `num_attention_heads` (among others). Assert here to fail early
    # if transformers config doesn't align with this assumption.
    assert hasattr(config.text_config, "num_attention_heads")
    return config.text_config


def try_get_generation_config(
    model: str,
    trust_remote_code: bool,
    revision: Optional[str] = None,
) -> Optional[GenerationConfig]:
    """Load the generation config for ``model`` if one can be found.

    A dedicated generation config is tried first. If that raises ``OSError``,
    one is derived from the model config instead.

    Returns:
        The generation config, or ``None`` when loading the model config
        also raises ``OSError``.
    """
    try:
        return GenerationConfig.from_pretrained(
            model,
            revision=revision,
        )
    except OSError:  # Not found
        try:
            config = get_config(
                model,
                trust_remote_code=trust_remote_code,
                revision=revision,
            )
            return GenerationConfig.from_model_config(config)
        except OSError:  # Not found
            return None


def get_cross_encoder_activation_function(config: PretrainedConfig):
    """Instantiate the activation function for a cross-encoder.

    If the config names one in ``sbert_ce_default_activation_function``, that
    object is resolved and instantiated. Otherwise the result is
    ``nn.Sigmoid()`` when ``config.num_labels == 1`` and ``nn.Identity()``
    for any other label count.

    Raises:
        ValueError: if the configured name is not under
            ``torch.nn.modules.``.
    """
    function_name = getattr(config, "sbert_ce_default_activation_function",
                            None)
    if function_name is None:
        return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()

    # Security check on a value that comes from the model's config file.
    # It must not be an `assert`, which `python -O` removes and which would
    # let any importable callable be resolved and called.
    if not function_name.startswith("torch.nn.modules."):
        raise ValueError(
            "Loading of activation functions is restricted to "
            "torch.nn.modules for security reasons")
    return resolve_obj_by_qualname(function_name)()
@dataclass
class ArcticLoraConfig:
    # LoRA settings container. Nothing in the visible part of this module
    # reads it; field meanings below are inferred from the names only.
    lora_r: int = 64  # presumably the LoRA rank -- TODO confirm
    lora_alpha: float = 16  # presumably the LoRA scaling factor -- TODO confirm
    shard_base_weights: bool = False


@dataclass
class ArcticQuantizationConfig:
    # Typed form of the `quantization` entry of an Arctic config.
    # ArcticConfig builds it from a plain dict and turns it back into a dict
    # in `to_dict`.
    q_bits: int = 8
    rounding: str = "nearest"
    mantissa_bits: int = 3
    group_size: int = 128


class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it is set to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. Stored as `None` when not specified.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            The number of experts to route per-token.
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        moe_layer_frequency (`int`, *optional*, defaults to 2):
            Stored unchanged on the config.
        parallel_attn_mlp_res (`bool`, *optional*, defaults to `False`):
            Stored unchanged on the config.
        moe_train_capacity_factor (`int`, *optional*, defaults to 1):
            Stored unchanged on the config.
        moe_eval_capacity_factor (`int`, *optional*, defaults to 1):
            Stored unchanged on the config.
        enable_expert_tensor_parallelism (`bool`, *optional*, defaults to `False`):
            Stored unchanged on the config.
        moe_min_capacity (`int`, *optional*, defaults to 0):
            Stored unchanged on the config.
        moe_token_dropping (`bool`, *optional*, defaults to `True`):
            Stored unchanged on the config.
        quantization (`dict` or [`ArcticQuantizationConfig`], *optional*):
            Quantization settings. A `dict` is converted to an [`ArcticQuantizationConfig`]; any other value is
            stored unchanged.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a configuration with the default values
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from that configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        # A dict (e.g. parsed from JSON) is promoted to the typed dataclass;
        # None or an existing ArcticQuantizationConfig is kept as given.
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
        # Build the config through the parent implementation, then make sure
        # `quantization` is an ArcticQuantizationConfig rather than a dict.
        result = super().from_dict(config_dict, **kwargs)
        # The parent may return either the config or a tuple whose first
        # element is the config; in both cases `result` is returned unchanged.
        config = result[0] if isinstance(result, tuple) else result
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> Dict[str, Any]:
        # Serialize through the parent, then flatten the quantization
        # dataclass into a plain dict.
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret


class ChatGLMConfig(PretrainedConfig):
    """Configuration for ChatGLM models.

    Every constructor argument is stored on the instance under its own name,
    with two additions: `vocab_size` mirrors `padded_vocab_size`, and
    `max_position_embeddings` mirrors `seq_length`. Remaining keyword
    arguments are forwarded to `PretrainedConfig`.
    """
    model_type = "chatglm"
    # Consumed by PretrainedConfig; presumably lets the generic names on the
    # left resolve to the ChatGLM-specific attributes on the right -- confirm
    # against the transformers documentation.
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        # Both names hold the same value, taken from `padded_vocab_size`.
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
class Cohere2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        sliding_window_pattern (`int`, *optional*, defaults to 4):
            Pattern for the sliding window attention.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere2 configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
    """

    model_type = "cohere2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        sliding_window=4096,
        sliding_window_pattern=4,
        cache_implementation="hybrid",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.logit_scale = logit_scale
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.sliding_window = sliding_window
        self.sliding_window_pattern = sliding_window_pattern
        # Need to specify head_dim in the config so it can be used in the attention forward functions
        # (integer division: any remainder of hidden_size / num_attention_heads is discarded).
        self.head_dim = hidden_size // num_attention_heads
        self.cache_implementation = cache_implementation

        # Validate the correctness of rotary position embeddings parameters.
        # This runs before the base-class initializer, so it only sees the
        # attributes assigned above.
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
class DeepseekV2Config(PretrainedConfig):
    """Configuration for the DeepSeek-V2 language model.

    Each constructor argument is stored as an attribute of the same name.
    Two values are normalized on the way in: ``num_key_value_heads`` falls
    back to ``num_attention_heads`` when it is ``None``, and ``rms_norm_eps``
    is coerced with ``float()``.

    The token ids, ``tie_word_embeddings`` and any extra keyword arguments are
    forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # NOTE(review): 'gready' looks like a misspelling of 'greedy'. It is
        # left untouched because the default is part of the interface; check
        # which spelling the consuming model code compares against.
        topk_method='gready',
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # Coerced to float, so a value given as a string or int is accepted
        # as long as float() can convert it.
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
class EAGLEConfig(PretrainedConfig):
    """Configuration wrapper that embeds another model's config under ``model``.

    ``model`` may be ``None``, an already-built config object, or a dict that
    is turned into a config through ``AutoConfig.for_model``. Keyword
    arguments that name an existing attribute of the wrapped config are
    written onto it (except ``architectures`` and ``model_type``). After the
    base class is initialized, every entry of the wrapped config's
    ``to_dict()`` that this object does not already expose is copied onto it.
    """

    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):
        # Resolve the wrapped configuration object.
        if model is None:
            wrapped = None
        elif isinstance(model, dict):
            wrapped = AutoConfig.for_model(**model)
        else:
            wrapped = model

        # Push overrides onto the wrapped config; these two keys describe
        # the EAGLE wrapper itself and are never forwarded.
        wrapper_only = ("architectures", "model_type")
        for name, value in kwargs.items():
            if name in wrapper_only:
                continue
            if hasattr(wrapped, name):
                setattr(wrapped, name, value)

        self.model = wrapped

        # Without a wrapped config there is no vocabulary to truncate; an
        # explicit truncated_vocab_size wins over the wrapped vocab size.
        if self.model is None:
            self.truncated_vocab_size = None
        elif truncated_vocab_size is None:
            self.truncated_vocab_size = self.model.vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size

        kwargs.setdefault("architectures", ["EAGLEModel"])

        super().__init__(**kwargs)

        # Mirror wrapped-config fields this object does not define itself.
        if self.model is not None:
            for name, value in self.model.to_dict().items():
                if hasattr(self, name):
                    continue
                setattr(self, name, value)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        """Load the raw config dict for the given name/path and build from it."""
        loaded, remaining = cls.get_config_dict(pretrained_model_name_or_path,
                                                **kwargs)
        return cls.from_dict(loaded, **remaining)
class ExaoneConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:
    `~transformers.ExaoneModel`. It is used to instantiate an EXAONE model
    according to the specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from
    :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 102400):
            Vocabulary size of the model. Defines the number of different
            tokens that can be represented by the :obj:`inputs_ids` passed
            when calling :class:`~transformers.ExaoneModel`.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        hidden_size (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers (:obj:`int`, `optional`, defaults to 32):
            Number of hidden layers in the Transformer encoder. Also stored
            as ``num_hidden_layers``.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi
            Head Attention (MHA), if `num_key_value_heads=1` the model will use
            Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
            specified, will default to `num_attention_heads`.
        intermediate_size (:obj:`int`, `optional`):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
            the Transformer encoder. When omitted (or any falsy value), it is
            set to ``hidden_size * 4`` (8192 with the default ``hidden_size``).
        activation_function (:obj:`str` or :obj:`function`, `optional`,
        defaults to :obj:`"silu"`):
            The non-linear activation function (function or string) in the
            encoder and pooler.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            percentage of hidden dimensions to allocate to rotary embeddings
        resid_dropout (:obj:`float`, `optional`, defaults to 0.0):
            Stored as ``resid_dropout``.
        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
        bos_token_id (:obj:`int`, `optional`, defaults to 0):
            Forwarded to the base class and also stored on this object.
        eos_token_id (:obj:`int`, `optional`, defaults to 2):
            Forwarded to the base class and also stored on this object.
        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Forwarded to the base class.
        type_vocab_size, gradient_checkpointing:
            Not explicit parameters of this constructor; if supplied they are
            only passed through ``**kwargs`` to the base class.

    Example::

        >>> from transformers import ExaoneModel, ExaoneConfig

        >>> # Initializing a EXAONE configuration
        >>> configuration = ExaoneConfig()

        >>> # Initializing a model from configuration
        >>> model = ExaoneModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "exaone"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_hidden_layers": "num_layers"}

    def __init__(
        self,
        vocab_size=102400,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        intermediate_size=None,
        activation_function="silu",
        rotary_pct=0.25,
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        # The base class is initialized first and receives the full kwargs.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_layers
        # Fall back to one KV head per attention head when unspecified.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # Truthiness check: None and 0 both select the 4x-hidden default.
        if intermediate_size:
            self.intermediate_size = intermediate_size
        else:
            self.intermediate_size = hidden_size * 4
        self.activation_function = activation_function
        self.resid_dropout = resid_dropout
        self.embed_dropout = embed_dropout
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rotary_pct = rotary_pct

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Optional flags read from kwargs, each with an explicit default.
        # They are popped from the local dict only after the base class has
        # already been given the same kwargs above.
        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
        self.use_gated = kwargs.pop("use_gated", False)
        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
        self.rotary_type = kwargs.pop("rotary_type", None)
        self.scaling_factor = kwargs.pop("scaling_factor", 1)
        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
        # Default depends on rotary_pct being exactly 0.25.
        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                                 (rotary_pct == 0.25))
        # Rotary positions override absolute positions.
        if self.use_rotary_pos:
            self.use_absolute_pos = False
class RWConfig(PretrainedConfig):
    """Configuration for Falcon ("RW") models.

    Constructor arguments are stored as attributes of the same name, with
    these adjustments: a legacy ``n_embed`` keyword, when present and not
    ``None``, replaces ``hidden_size``; ``n_head_kv`` becomes ``1`` when it is
    ``None``; and ``new_decoder_architecture`` is forced to ``True`` whenever
    the resulting hidden size equals 8192.
    """

    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size

        # Older configs spell the hidden width as "n_embed"; it is removed
        # from kwargs so the base class never sees it.
        legacy_width = kwargs.pop("n_embed", None)
        if legacy_width is None:
            self.hidden_size = hidden_size
        else:
            self.hidden_size = legacy_width

        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query

        # A missing KV-head count means a single shared KV head.
        if n_head_kv is None:
            self.n_head_kv = 1
        else:
            self.n_head_kv = n_head_kv

        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        # Hack for falcon-40b
        if self.hidden_size == 8192:
            self.new_decoder_architecture = True

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    @property
    def head_dim(self):
        """Per-head width: hidden size floor-divided by the head count."""
        width, heads = self.hidden_size, self.n_head
        return width // heads

    @property
    def rotary(self):
        """True exactly when ALiBi is not enabled."""
        uses_alibi = self.alibi
        return not uses_alibi
class InternVLChatConfig(PretrainedConfig):
    """Composite configuration for InternVL chat models.

    ``vision_config`` and ``llm_config`` are plain dictionaries (``None``
    means empty).  Each is expanded into a generic ``PretrainedConfig`` and
    stored as ``vision_config`` and ``text_config`` respectively.  Every
    other argument is stored unchanged under its own name.
    """

    model_type = 'internvl_chat'
    is_composition = True

    def __init__(self,
                 vision_config=None,
                 llm_config=None,
                 use_backbone_lora=0,
                 use_llm_lora=0,
                 select_layer=-1,
                 force_image_size=None,
                 downsample_ratio=0.5,
                 template=None,
                 dynamic_image_size=False,
                 use_thumbnail=False,
                 ps_version='v1',
                 min_dynamic_patch=1,
                 max_dynamic_patch=6,
                 **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}

        if llm_config is None:
            llm_config = {}

        self.vision_config = PretrainedConfig(**vision_config)
        self.text_config = PretrainedConfig(**llm_config)

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch


class MedusaConfig(PretrainedConfig):
    """Configuration for Medusa speculative-decoding heads.

    ``truncated_vocab_size`` falls back to ``vocab_size`` when it is not
    given, ``max_seq_len`` is fixed at ``2**20``, and ``architectures``
    defaults to ``["MedusaModel"]`` when the caller does not provide it.
    """

    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        """Load a config, normalising differently-spelled key names.

        Any key containing both ``num`` and ``heads`` is renamed to
        ``num_heads``; any key containing ``num`` and ``layers`` is renamed
        to ``num_hidden_layers``.
        """
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        # Iterate over a snapshot of the keys: the dict is modified below.
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        """Always ``0``."""
        return 0

    @property
    def num_lookahead_tokens(self):
        """Alias for ``num_heads``."""
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens


class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    '''
    Use this class to override is_encoder_decoder:
    - transformers regards mllama as is_encoder_decoder=False
    - vllm needs is_encoder_decoder=True to enable cross-attention
    '''

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.is_encoder_decoder = True


class MllamaConfig(mllama_hf_config.MllamaConfig):
    """Mllama configuration whose dict ``text_config`` is built with the
    ``MllamaTextConfig`` override defined above."""

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        # Only a dict is converted; any other value is passed through as-is.
        if isinstance(text_config, dict):
            text_config = MllamaTextConfig(**text_config)
        super().__init__(text_config=text_config, **kwargs)


# Defaults merged into the per-instance ``attn_config`` / ``ffn_config`` /
# ``init_config`` dictionaries of ``MPTConfig``.  They are templates only:
# ``MPTConfig`` copies them and never stores or mutates these objects.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}


class MPTConfig(PretrainedConfig):
    """A HuggingFace-style configuration for MPT models.

    ``attn_config``, ``ffn_config`` and ``init_config`` are dictionaries;
    keys missing from them are filled in from the module-level
    ``*_config_defaults`` templates during validation.  Each instance keeps
    its own copy of these dictionaries, so changing one configuration never
    affects another instance, the caller's dictionaries or the templates.

    Raises:
        ValueError: If a value fails the checks in ``_validate_config``.
        NotImplementedError: If ``prefix_lm``, ``alibi`` or
            ``attn_uses_sequence_id`` is requested with an attention
            implementation other than ``torch`` or ``triton``.
        ImportError: If TransformerEngine is requested but not importable.
    """

    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # The dict defaults are kept for signature compatibility; they are copied
    # below and therefore never shared or mutated.
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Dict = attn_config_defaults,
                 ffn_config: Dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        # Take private shallow copies (all default values are scalars).
        # Storing the argument itself would make every default-constructed
        # config share the module-level dicts, which ``_validate_config``
        # writes into.  ``None`` is treated as "use the defaults".
        self.attn_config = dict(attn_config) if attn_config is not None else {}
        self.ffn_config = dict(ffn_config) if ffn_config is not None else {}
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = dict(init_config) if init_config is not None else {}
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        # These keys are dropped rather than forwarded to the base class.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        """Fill keys missing from ``config`` with ``config_defaults`` values.

        ``config`` is updated in place and also returned; keys that are
        already present keep their value.
        """
        for k, v in config_defaults.items():
            config.setdefault(k, v)
        return config

    def _validate_config(self) -> None:
        """Apply defaults to the nested configs and check value constraints.

        Also records ``fc_type`` (for ``mptmlp``) or ``bias`` (for
        ``te_ln_mlp``) inside ``ffn_config``.
        """
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any(
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
             ]):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # Import only to prove the optional dependency is present.
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import fail. `fc_type: te` requires '
                    'TransformerEngine be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 is installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        # Safe to write here: ``ffn_config`` is this instance's own copy.
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
class NemotronConfig(PretrainedConfig):
    r"""Configuration of a [`NemotronModel`].

    Collects the hyper-parameters that define a Nemotron architecture.
    Leaving every argument at its default yields a configuration similar to
    Nemotron-8B.  The class derives from [`PretrainedConfig`]; see that
    class for the options shared by all configurations.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Number of distinct tokens representable by the `inputs_ids`
            passed to [`NemotronModel`].
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads per attention layer.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention.  When
            falsy, the legacy `kv_channels` keyword is consulted; when the
            result is still `None`, `hidden_size // num_attention_heads` is
            used.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads for Grouped Query Attention.  Equal
            to `num_attention_heads` means Multi Head Attention, `1` means
            Multi Query Attention, anything else GQA (see
            https://arxiv.org/pdf/2305.13245.pdf).  Defaults to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            Non-linear activation used in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            Maximum sequence length the model might be used with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            Standard deviation of the truncated-normal weight initializer.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions.
            Only relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            When given, must hold exactly two entries: `type` (`"linear"`
            or `"dynamic"`) and `factor` (a `float` greater than 1).
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Fraction of the query and keys that receives rotary embedding.
            A truthy legacy `rope_percent` or `rope_percentage` keyword
            takes precedence, in that order.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether the query, key, value and output projections use a bias.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether `up_proj` and `down_proj` in the MLP use a bias.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Build a configuration from the default values
    >>> configuration = NemotronConfig()
    >>> # Build a model from that configuration
    >>> model = NemotronModel(configuration)
    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        # Resolve the backward-compatibility fallbacks first.  The legacy
        # keywords are read without being removed, so they are still
        # forwarded to the base class below.
        if not head_dim:
            head_dim = kwargs.get("kv_channels")
        if head_dim is None:
            head_dim = hidden_size // num_attention_heads

        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        for legacy_key in ("rope_percent", "rope_percentage"):
            legacy_factor = kwargs.get(legacy_key)
            if legacy_factor:
                partial_rotary_factor = legacy_factor
                break

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """Check the shape and values of ``rope_scaling``.

        ``None`` is accepted as "no scaling".

        Raises:
            ValueError: If the value is not a two-entry dict, if ``type`` is
                not ``"linear"`` or ``"dynamic"``, or if ``factor`` is not a
                ``float`` greater than 1.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        has_expected_shape = isinstance(scaling, dict) and len(scaling) == 2
        if not has_expected_shape:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {scaling}")

        kind = scaling.get("type", None)
        factor = scaling.get("factor", None)

        # A missing type is ``None``, which is not in the tuple either.
        if kind not in ("linear", "dynamic"):
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {kind}")

        # A missing factor is ``None`` and fails the isinstance test.
        if not isinstance(factor, float) or factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{factor}")


class NVLM_D_Config(InternVLChatConfig):
    """InternVL chat configuration registered under the NVLM-D model type."""

    model_type = 'NVLM_D'
class SolarConfig(PretrainedConfig):
    r"""Configuration of a [`SolarModel`].

    Collects the hyper-parameters that define a Solar architecture.  With
    every argument left at its default the result is similar to the
    LLaMA-7B configuration.  The class derives from [`PretrainedConfig`];
    see that class for the options shared by all configurations.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Number of distinct tokens representable by the `inputs_ids`
            passed to [`SolarModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads per attention layer.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads for Grouped Query Attention.  Equal
            to `num_attention_heads` means Multi Head Attention, `1` means
            Multi Query Attention, anything else GQA (see
            https://arxiv.org/pdf/2305.13245.pdf).  Defaults to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            Non-linear activation used in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            Maximum sequence length the model might be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal weight initializer.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions.
            Only relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental.  Tensor parallelism rank used during pretraining;
            needed for exact reproducibility of the pretraining results.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Scaling configuration for the RoPE embeddings in the form
            `{"type": strategy name, "factor": scaling factor}`.  The
            strategy must be `"linear"` or `"dynamic"` and the factor a
            `float` greater than 1.  Experimental; do not update
            `max_position_embeddings` when using it.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether the query, key, value and output projections use a bias.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether `up_proj`, `down_proj` and `gate_proj` use a bias.
        sliding_window (`int`, *optional*, defaults to 2047):
            Sliding window attention window size.
        bskcn_1, bskcn_2, bskcn_3, bskcn_4 (`list`, *optional*):
            Default to `[12, 20, 32, 44]`, `[20, 32]`, `[16, 24, 36, 48]`
            and `[28, 40]` respectively.  Presumably layer indices consumed
            by the model implementation; confirm there.
        bskcn_tv (`list`, *optional*):
            Defaults to `[0.9, 0.8]`.  Meaning is not visible in this
            module; confirm against the model implementation.

    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> # Build a configuration from the default values
    >>> configuration = SolarConfig()
    >>> # Build a model from that configuration
    >>> model = SolarModel(configuration)
    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window

        # The fallback lists are created per call, so instances never share
        # them.
        bskcn_settings = (
            ("bskcn_1", bskcn_1, [12, 20, 32, 44]),
            ("bskcn_2", bskcn_2, [20, 32]),
            ("bskcn_3", bskcn_3, [16, 24, 36, 48]),
            ("bskcn_4", bskcn_4, [28, 40]),
            ("bskcn_tv", bskcn_tv, [0.9, 0.8]),
        )
        for attr_name, given, fallback in bskcn_settings:
            setattr(self, attr_name, fallback if given is None else given)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """Check the shape and values of ``rope_scaling``.

        ``None`` is accepted as "no scaling".

        Raises:
            ValueError: If the value is not a two-entry dict, if ``type`` is
                not ``"linear"`` or ``"dynamic"``, or if ``factor`` is not a
                ``float`` greater than 1.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        has_expected_shape = isinstance(scaling, dict) and len(scaling) == 2
        if not has_expected_shape:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields,"
                " `type` and `factor`, "
                f"got {scaling}")

        kind = scaling.get("type", None)
        factor = scaling.get("factor", None)

        # A missing type is ``None``, which is not in the tuple either.
        if kind not in ("linear", "dynamic"):
            raise ValueError(f"`rope_scaling`'s type field must be one of "
                             f"['linear', 'dynamic'], got {kind}")

        # A missing factor is ``None`` and fails the isinstance test.
        if not isinstance(factor, float) or factor <= 1.0:
            raise ValueError(
                f"`rope_scaling`'s factor field must be a float > 1,"
                f" got {factor}")
def _resolve_backbone_config(model_id: Optional[str],
                             config_dict: Optional[Dict[str, Any]],
                             default_model_type: str):
    """Return the config object for one Ultravox backbone.

    A model id takes precedence and is resolved through vLLM's
    ``get_config``.  Otherwise the dict (``None`` or empty means no
    overrides) is expanded with the config class registered in
    ``transformers.CONFIG_MAPPING`` for its ``model_type``, falling back to
    ``default_model_type``.
    """
    if model_id is not None:
        # Avoid circular import
        from vllm.transformers_utils.config import get_config

        return get_config(model_id, trust_remote_code=False)

    config_dict = config_dict or {}
    model_type = config_dict.get("model_type", default_model_type)
    config_cls = transformers.CONFIG_MAPPING[model_type]
    return config_cls(**config_dict)


class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    Configuration of a [`UltravoxForConditionalGeneration`] model.

    The arguments define the model architecture.  The class derives from
    [`PretrainedConfig`]; see that class for the options shared by all
    configurations.

    Args:
        audio_config (`dict`, *optional*):
            Custom audio config dict.  Used only when `audio_model_id` is
            `None`; its `model_type` defaults to `"whisper"`.
        text_config (`dict`, *optional*):
            Config dict of the text backbone.  Used only when
            `text_model_id` is `None`; its `model_type` defaults to
            `"llama"`.
        audio_model_id (`str`, *optional*):
            When given, the audio config is loaded from this model id.
        text_model_id (`str`, *optional*):
            When given, the text config is loaded from this model id.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        hidden_size (`int`, *optional*, defaults to 4096):
            Stored as `hidden_size`.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`dict`, *optional*):
            The LoRA configuration for finetuning the text model; a falsy
            value is stored as `{}`.
        audio_model_lora_config (`dict`, *optional*):
            The LoRA configuration for finetuning the audio model; a falsy
            value is stored as `{}`.

    `vocab_size` and `initializer_range` are copied from the resolved text
    config.
    """

    model_type = "ultravox"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[Dict[str, Any]] = None,
        text_config: Optional[Dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[Dict[str, Any]] = None,
        audio_model_lora_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act

        # The text backbone is resolved before the audio backbone.
        self.text_config = _resolve_backbone_config(text_model_id,
                                                    text_config, "llama")
        self.audio_config = _resolve_backbone_config(audio_model_id,
                                                     audio_config, "whisper")

        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        # Mirror the text backbone's values on the top-level config.
        text_backbone = self.text_config
        self.vocab_size = text_backbone.vocab_size

        self.initializer_range = text_backbone.initializer_range

        super().__init__(**kwargs)
class Detokenizer:
    """Provides methods to decode the output of a model into text."""

    def __init__(self, tokenizer_group: BaseTokenizerGroup):
        # Tokenizer group that is queried for the tokenizer of each sequence.
        self.tokenizer_group = tokenizer_group

    def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
        """Returns the HF tokenizer to use for a given sequence."""
        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)

    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
                                       prompt_logprobs: List[Optional[Dict[
                                           int, Logprob]]],
                                       position_offset: int) -> None:
        """Decodes the logprobs for the prompt of a sequence group.

        Args:
            seq_group: The sequence group to decode.
            prompt_logprobs: The logprobs to decode.
            position_offset: Offset of the first index of the logprobs
                relative to the start of the sequence (for chunked prefill).

        Returns:
            None. The ``decoded_token`` field of every not-yet-decoded
            ``Logprob`` in ``prompt_logprobs`` is filled in place.
        """
        prms = seq_group.sampling_params
        assert prms is not None

        # We can pick any sequence for the prompt.
        seq = seq_group.get_seqs()[0]
        # Only prompt, without the generated token.
        all_token_ids = seq.get_token_ids()
        prompt_token_ids = all_token_ids[:-1]
        tokenizer = self.get_tokenizer_for_seq(seq)
        # Offsets/tokens fed to the incremental detokenizer for the current
        # position; the `next_iter_*` values are staged while the candidates
        # of one position are decoded and committed once it is finished.
        prefix_offset = 0
        read_offset = 0
        next_iter_prefix_offset = 0
        next_iter_read_offset = 0
        next_iter_tokens: List[str] = []
        prev_tokens = None

        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
                prompt_logprobs):

            # Absolute token position equals the index in the logprobs
            # list plus the offset of the entire logprobs list relative
            # to the start of the sequence.
            token_position = token_position_in_logprob + position_offset
            if not prompt_logprobs_for_token:
                continue
            for token_id, sample_logprob in prompt_logprobs_for_token.items():
                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    # Decode the candidate as if it followed the real prompt
                    # prefix up to this position.
                    prompt_token_ids_with_token = (
                        prompt_token_ids[:token_position] + [token_id])
                    (new_tokens, new_text, new_prefix_offset,
                     new_read_offset) = detokenize_incrementally(
                         tokenizer=tokenizer,
                         all_input_ids=prompt_token_ids_with_token,
                         prev_tokens=prev_tokens,
                         prefix_offset=prefix_offset,
                         read_offset=read_offset,
                         skip_special_tokens=prms.skip_special_tokens,
                         spaces_between_special_tokens=prms.
                         spaces_between_special_tokens,
                     )

                    sample_logprob.decoded_token = new_text

                    # Use the offsets & prev tokens corresponding to the
                    # real prompt token so that detokenization stays
                    # consistent with the actual prompt.
                    if token_id == all_token_ids[token_position]:
                        next_iter_prefix_offset = new_prefix_offset
                        next_iter_read_offset = new_read_offset
                        next_iter_tokens = new_tokens

            # Advance to the next token position.
            prefix_offset = next_iter_prefix_offset
            read_offset = next_iter_read_offset
            if prev_tokens is None:
                # Copy so that later `extend` calls do not mutate the list
                # still referenced by `next_iter_tokens`.
                prev_tokens = next_iter_tokens.copy()
            else:
                prev_tokens.extend(next_iter_tokens)

    def decode_sequence_inplace(self, seq: Sequence,
                                prms: SamplingParams) -> int:
        """Decodes the new token for a sequence. In-place operation.

        Updates ``seq.tokens``, ``seq.prefix_offset``, ``seq.read_offset``
        and ``seq.output_text``, and fills ``decoded_token`` on the entries
        of the last element of ``seq.output_logprobs``.

        Args:
            seq: The sequence to decode.
            prms: The sampling parameters used to generate the sequence.

        Returns:
            The number of characters added to the output text.
        """
        all_input_ids = seq.get_token_ids()
        token_id_generated_this_iteration = all_input_ids[-1]
        tokenizer = self.get_tokenizer_for_seq(seq)

        # Convert prompt token IDs to tokens if necessary.
        # Do it here so that we don't have to repeat this
        # computation for each logprob.
        if seq.tokens is None:
            (seq.tokens, seq.prefix_offset,
             seq.read_offset) = convert_prompt_ids_to_tokens(
                 tokenizer=tokenizer,
                 prompt_ids=all_input_ids[:-1],
                 skip_special_tokens=prms.skip_special_tokens,
             )

        (new_tokens, new_decoded_token_text, prefix_offset,
         read_offset) = detokenize_incrementally(
             tokenizer=tokenizer,
             all_input_ids=all_input_ids,
             prev_tokens=seq.tokens,
             prefix_offset=seq.prefix_offset,
             read_offset=seq.read_offset,
             skip_special_tokens=prms.skip_special_tokens,
             spaces_between_special_tokens=prms.spaces_between_special_tokens,
         )

        # Decode logprobs
        logprobs = seq.output_logprobs[-1]
        if logprobs:
            previous_tokens = all_input_ids[:-1]
            for token_id, sample_logprob in logprobs.items():
                # If the token was generated this iteration,
                # use the provided text.
                if token_id == token_id_generated_this_iteration:
                    sample_logprob.decoded_token = new_decoded_token_text
                    continue

                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    # Decode the alternative candidate against the same
                    # (not yet advanced) sequence state.
                    all_input_ids_with_logprob = previous_tokens + [token_id]
                    (_, new_text, _, _) = detokenize_incrementally(
                        tokenizer=tokenizer,
                        all_input_ids=all_input_ids_with_logprob,
                        prev_tokens=seq.tokens,
                        prefix_offset=seq.prefix_offset,
                        read_offset=seq.read_offset,
                        skip_special_tokens=prms.skip_special_tokens,
                        spaces_between_special_tokens=prms.
                        spaces_between_special_tokens,
                    )
                    sample_logprob.decoded_token = new_text

        # Commit the sequence state only after all candidates were decoded
        # against the previous state.
        seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_decoded_token_text

        return len(new_decoded_token_text)
def _replace_none_with_empty(tokens: List[Optional[str]]):
    """Replace every ``None`` entry of ``tokens`` with ``""`` in place."""
    for i, token in enumerate(tokens):
        if token is None:
            tokens[i] = ""


def _convert_tokens_to_string_with_added_encoders(
    tokenizer: AnyTokenizer,
    output_tokens: List[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    """Join ``output_tokens`` into text, keeping added-vocab tokens verbatim.

    Runs of regular tokens are converted with
    ``tokenizer.convert_tokens_to_string``; tokens found in the tokenizer's
    added vocabulary are emitted unchanged as their own segment.

    Args:
        tokenizer: The tokenizer to use.
        output_tokens: The tokens to convert.
        skip_special_tokens: Whether to drop tokens listed in
            ``tokenizer.all_special_tokens``.
        spaces_between_special_tokens: Whether the segments are joined with
            a single space instead of being concatenated directly.

    Returns:
        The resulting text.
    """
    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE(woosuk): The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts: List[str] = []
    current_sub_text: List[str] = []
    all_special_tokens = set(tokenizer.all_special_tokens)
    # Fetch the added vocabulary once: the loop below never changes the
    # tokenizer, so asking for it again for every token is wasted work.
    added_vocab = tokenizer.get_added_vocab()
    for token in output_tokens:
        if skip_special_tokens and token in all_special_tokens:
            continue
        if token in added_vocab:
            # Flush the pending run of regular tokens before the added token.
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    separator = " " if spaces_between_special_tokens else ""
    return separator.join(sub_texts)


# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5


def convert_prompt_ids_to_tokens(
    tokenizer: AnyTokenizer,
    prompt_ids: List[int],
    skip_special_tokens: bool = False,
) -> Tuple[List[str], int, int]:
    """Converts the prompt ids to tokens and returns the tokens and offsets
    for incremental detokenization.

    Note that not all tokens are converted to strings. Only the tokens that
    are necessary for incremental detokenization are converted to strings.

    Args:
        tokenizer: The tokenizer to use.
        prompt_ids: The prompt token ids.
        skip_special_tokens: Forwarded to
            ``tokenizer.convert_ids_to_tokens``.

    Returns:
        A tuple ``(tokens, prefix_offset, read_offset)`` where ``tokens`` are
        the converted trailing prompt tokens, ``read_offset`` is their count
        and ``prefix_offset`` lies
        ``INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET`` tokens before it
        (clamped at 0).
    """
    # We do not need to convert the whole prompt to tokens.
    # Offset a little more in case we have special tokens.
    new_tokens = tokenizer.convert_ids_to_tokens(
        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
        skip_special_tokens=skip_special_tokens)
    read_offset = len(new_tokens)
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # This is required to guard against out-of-vocab prompt token ids
    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset


# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: AnyTokenizer,
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.

    Returns:
        A tuple ``(new_tokens, new_text, prefix_offset, read_offset)``.
        ``new_text`` is ``""`` and the offsets are returned unchanged when
        the new token does not yet yield additional complete text.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset,
         read_offset) = convert_prompt_ids_to_tokens(
             tokenizer,
             all_input_ids[:-1],
             skip_special_tokens=skip_special_tokens)
    assert prev_tokens is not None

    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens

    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset

    new_text = new_text[len(prefix_text):]
    return new_tokens, new_text, read_offset, len(output_tokens)
def get_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    processor_cls: type[ProcessorMixin] = ProcessorMixin,
    **kwargs: Any,
):
    """Load a processor for the given model name via HuggingFace.

    ``AutoProcessor`` is used unless a more specific ``processor_cls`` is
    given. A ``ValueError`` raised while loading is converted into a
    ``RuntimeError`` hinting at ``trust_remote_code`` when that flag is
    off; otherwise the ``ValueError`` propagates unchanged.
    """
    # Imported lazily on purpose: importing this at module level would
    # call torch.cuda.device_count().
    from transformers import AutoProcessor

    if processor_cls == ProcessorMixin:
        factory = AutoProcessor
    else:
        factory = processor_cls

    try:
        loaded = factory.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # Unlike AutoTokenizer, AutoProcessor does not distinguish a
        # missing/unimported processor class from other value errors, so
        # the hint is given for every ValueError when remote code is off.
        if trust_remote_code:
            raise
        err_msg = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(ProcessorMixin, loaded)


cached_get_processor = lru_cache(get_processor)


def get_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    A ``ValueError`` raised while loading is converted into a
    ``RuntimeError`` hinting at ``trust_remote_code`` when that flag is
    off; otherwise the ``ValueError`` propagates unchanged.
    """
    # Imported lazily on purpose: importing this at module level would
    # call torch.cuda.device_count().
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # Unlike AutoTokenizer, AutoImageProcessor does not distinguish a
        # missing/unimported processor class from other value errors.
        if trust_remote_code:
            raise
        err_msg = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(BaseImageProcessor, loaded)


def get_video_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    The full processor is loaded with ``get_processor`` and its
    ``video_processor`` attribute is returned.
    """
    # Imported lazily, like the other loaders in this module.
    from transformers.image_processing_utils import BaseImageProcessor

    full_processor = get_processor(
        processor_name,
        *args,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )
    video_processor = full_processor.video_processor

    return cast(BaseImageProcessor, video_processor)
a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac2e6eacb5c059d3983325d1a6913dcffc380d62 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000000000000000000000000000000..d37381ea9925fc23d1f926ca8d1f7e42c49eb1e4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,363 @@ +# SPDX-License-Identifier: Apache-2.0 + +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class ImageTransform:
    """Convert a PIL image to a tensor, optionally normalizing it."""

    def __init__(self,
                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 normalize: bool = True):
        """Build the torchvision pipeline.

        Args:
            mean: Per-channel mean passed to ``T.Normalize``.
            std: Per-channel std passed to ``T.Normalize``.
            normalize: Whether ``T.Normalize`` is appended after
                ``T.ToTensor``.
        """
        self.mean = mean
        self.std = std
        self.normalize = normalize

        transform_pipelines = [T.ToTensor()]

        if normalize:
            transform_pipelines.append(T.Normalize(mean, std))

        self.transform = T.Compose(transform_pipelines)

    def __call__(self, pil_img: Image.Image):
        """Apply the transform pipeline to ``pil_img``."""
        return self.transform(pil_img)


class DeepseekVLV2Processor(ProcessorMixin):
    """Processor that tokenizes a prompt and tiles its images for
    DeepSeek-VL2."""

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]

    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        candidate_resolutions: Tuple[Tuple[int, int]],
        patch_size: int,
        downsample_ratio: int,
        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<|▁pad▁|>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):
        """Configure the tokenizer and the image pre-processing.

        Args:
            tokenizer: Tokenizer to use; its padding side is set to
                ``'left'`` and the special tokens below are added to it.
            candidate_resolutions: ``(width, height)`` candidates for the
                local views; the first entry's first value is used as the
                tile size ``image_size``.
            patch_size: Used together with ``downsample_ratio`` to compute
                the number of image tokens per tile.
            downsample_ratio: See ``patch_size``.
            image_mean: Per-channel mean for normalization; also used
                (scaled by 255) as the padding color.
            image_std: Per-channel std for normalization.
            normalize: Whether images are normalized.
            image_token: Placeholder string marking an image in the prompt.
                Must be non-empty because the prompt is split on it.
            pad_token: Pad token added when the tokenizer has none.
            add_special_token: Stored on the instance.
            sft_format: Stored on the instance.
            mask_prompt: Stored on the instance.
            ignore_id: Id written at image positions of the target ids.
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """

        self.candidate_resolutions = candidate_resolutions
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio

        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
        self.tokenizer = tokenizer
        # Must be set: the padding side makes a difference in batch inference.
        self.tokenizer.padding_side = 'left'

        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
        if tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': pad_token})

        # add image token
        image_token_id = self.tokenizer.vocab.get(image_token)
        if image_token_id is None:
            special_tokens = [image_token]
            special_tokens_dict = {"additional_special_tokens": special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
        self.image_token_id = self.tokenizer.vocab.get(image_token)

        # add five special tokens for grounding-related tasks
        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        # add special tokens for SFT data
        special_tokens = ["<|User|>", "<|Assistant|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id

        super().__init__(
            tokenizer,
            **kwargs,
        )

    def select_best_resolution(self, image_size):
        """Pick the candidate resolution that best fits ``image_size``.

        The candidate keeping the most effective pixels after an
        aspect-preserving resize wins; ties go to the candidate wasting
        the fewest pixels.

        Args:
            image_size: ``(width, height)`` of the original image.

        Returns:
            The chosen ``(width, height)``, or None when no candidate
            yields a positive effective resolution.
        """
        # used for cropping
        original_width, original_height = image_size
        best_fit = None
        max_effective_resolution = 0
        min_wasted_resolution = float("inf")

        for width, height in self.candidate_resolutions:
            scale = min(width / original_width, height / original_height)
            downscaled_width, downscaled_height = int(
                original_width * scale), int(original_height * scale)
            effective_resolution = min(downscaled_width * downscaled_height,
                                       original_width * original_height)
            wasted_resolution = (width * height) - effective_resolution

            if effective_resolution > max_effective_resolution or (
                    effective_resolution == max_effective_resolution
                    and wasted_resolution < min_wasted_resolution):
                max_effective_resolution = effective_resolution
                min_wasted_resolution = wasted_resolution
                best_fit = (width, height)

        return best_fit

    @property
    def bos_id(self):
        """Id of the tokenizer's BOS token."""
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        """Id of the tokenizer's EOS token."""
        return self.tokenizer.eos_token_id

    @property
    def pad_id(self):
        """Id of the tokenizer's pad token."""
        return self.tokenizer.pad_token_id

    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Tokenize ``text`` without automatic special tokens, then
        optionally add the BOS id in front and the EOS id at the end."""
        t = self.tokenizer.encode(text, add_special_tokens=False)

        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]

        return t

    def decode(self, t: List[int], **kwargs) -> str:
        """Decode token ids with the underlying tokenizer."""
        return self.tokenizer.decode(t, **kwargs)

    def process_one(
        self,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """Tokenize one prompt and pre-process its images.

        Args:
            prompt (str): the formatted prompt; it must contain one
                ``image_token`` placeholder per image;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs: accepted but not used here.

        Returns:
            A ``BatchFeature`` (``tensor_type="pt"``) with
                - input_ids: token ids with a leading batch dimension of 1
                - pixel_values: stacked global/local views, or one
                  all-zero image when there are no images
                - images_seq_mask: bool mask that is True at image token
                  positions
                - images_spatial_crop: [num_width_tiles, num_height_tiles]
                  per image
                - num_image_tokens: the number of image tokens per image
        """

        assert (prompt is not None and images is not None
                ), "prompt and images must be used at the same time."

        sft_format = prompt
        # Local views are only cropped when there are at most two images.
        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
                masked_tokenized_str.append(token_index)
            else:
                masked_tokenized_str.append(self.ignore_id)

        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")

        input_ids = torch.LongTensor(tokenized_str)
        # NOTE(review): target_ids is computed but not part of the returned
        # features; presumably kept for parity with the upstream
        # training-time processor — confirm before removing.
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
        target_ids[(input_ids < 0) |
                   (input_ids == self.image_token_id)] = self.ignore_id
        input_ids[input_ids < 0] = self.pad_id

        if inference_mode:
            # Remove the trailing eos token.
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            target_ids = target_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]

        if len(images_list) == 0:
            # No images: emit one all-zero placeholder image.
            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

        input_ids = input_ids.unsqueeze(0)

        prepare = BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )
        return prepare

    def __call__(
        self,
        *,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """Process one prompt with its images.

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs: accepted but not forwarded.

        Returns:
            The ``BatchFeature`` produced by ``process_one``.
        """

        prepare = self.process_one(
            prompt=prompt,
            images=images,
            inference_mode=inference_mode,
        )

        return prepare

    def tokenize_with_images(
        self,
        conversation: str,
        images: List[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Args:
            conversation: Prompt text containing exactly one
                ``image_token`` placeholder per entry of ``images``.
            images: The images, in placeholder order.
            bos: Whether to put the BOS id in front.
            eos: Whether to append the EOS id.
            cropping: Whether the local-view resolution is chosen with
                ``select_best_resolution``; otherwise a single
                ``image_size`` x ``image_size`` tile is used.

        Returns:
            ``(tokenized_str, images_list, images_seq_mask,
            images_spatial_crop, num_image_tokens)``.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)
        images_list, images_seq_mask, images_spatial_crop = [], [], []
        num_image_tokens = []
        tokenized_str = []
        # Padding color is the normalization mean scaled to 0..255.
        pad_color = tuple(int(x * 255) for x in self.image_transform.mean)
        for text_sep, image in zip(text_splits, images):
            # encode the text in front of this image
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)

            # select best resolution for anyres
            if cropping:
                best_width, best_height = self.select_best_resolution(image.size)
            else:
                best_width, best_height = self.image_size, self.image_size

            # process the global view
            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
                                       color=pad_color)
            images_list.append(self.image_transform(global_view))

            # process the local views: one image_size x image_size tile each
            local_view = ImageOps.pad(image, (best_width, best_height),
                                      color=pad_color)
            for i in range(0, best_height, self.image_size):
                for j in range(0, best_width, self.image_size):
                    images_list.append(
                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))

            # record height / width crop num
            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
            images_spatial_crop.append([num_width_tiles, num_height_tiles])

            # add image tokens
            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))

        # process the last text split
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        # add the bos and eos tokens
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

        assert len(tokenized_str) == len(
            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens


AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
    """Return the paths that match at least one ``fnmatch`` pattern."""
    return [
        path for path in paths if any(
            fnmatch.fnmatch(path, pattern) for pattern in patterns)
    ]


def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
    """Return the paths that match none of the ``fnmatch`` patterns."""
    return [
        path for path in paths
        if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
    ]


def glob(s3=None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
    List full file names from S3 path and filter by allow pattern.

    Args:
        s3: S3 client to use. A default ``boto3`` client is created when
            this is None.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.

    Returns:
        list[str]: List of full S3 paths allowed by the pattern
    """
    if s3 is None:
        s3 = boto3.client("s3")
    bucket_name, _, keys = list_files(s3,
                                      path=path,
                                      allow_pattern=allow_pattern)
    return [f"s3://{bucket_name}/{key}" for key in keys]


def list_files(
    s3,
    path: str,
    allow_pattern: Optional[list[str]] = None,
    ignore_pattern: Optional[list[str]] = None
) -> tuple[str, str, list[str]]:
    """
    List files from S3 path and filter by pattern.

    Every page of the listing is fetched: ``list_objects_v2`` returns a
    bounded number of keys per call, so the continuation token is followed
    until the response is no longer marked as truncated.

    Args:
        s3: S3 client to use.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.
        ignore_pattern: A list of patterns of which files not to pull.

    Returns:
        tuple[str, str, list[str]]: A tuple where:
            - The first element is the bucket name
            - The second element is the key prefix (everything after the
              bucket name in ``path``)
            - The third element is a list of object keys allowed or
              disallowed by pattern
    """
    bucket_name, _, prefix = path.removeprefix('s3://').partition('/')

    paths: list[str] = []
    request = {"Bucket": bucket_name, "Prefix": prefix}
    while True:
        response = s3.list_objects_v2(**request)
        paths.extend(obj['Key'] for obj in response.get('Contents', []))
        token = response.get('NextContinuationToken')
        # Stop on the last page; the token check also prevents an endless
        # loop if a response claims truncation without providing a token.
        if not response.get('IsTruncated') or not token:
            break
        request['ContinuationToken'] = token

    # Keys ending in "/" are directory placeholders, not files.
    paths = _filter_ignore(paths, ["*/"])
    if allow_pattern is not None:
        paths = _filter_allow(paths, allow_pattern)

    if ignore_pattern is not None:
        paths = _filter_ignore(paths, ignore_pattern)

    return bucket_name, prefix, paths


class S3Model:
    """
    A class representing a S3 model mirrored into a temporary directory.

    Attributes:
        s3: S3 client.
        dir: The temporary created directory.

    Methods:
        pull_files(): Pull model from S3 to the temporary directory.
    """

    def __init__(self) -> None:
        self.s3 = boto3.client('s3')
        # Create the directory before installing the handlers so that a
        # signal arriving during construction finds ``self.dir`` set.
        self.dir = tempfile.mkdtemp()
        for sig in (signal.SIGINT, signal.SIGTERM):
            existing_handler = signal.getsignal(sig)
            signal.signal(sig, self._close_by_signal(existing_handler))

    def __del__(self):
        self._close()

    def _close(self) -> None:
        """Remove the temporary directory if it was created and still exists."""
        # ``dir`` is missing when __init__ raised before assigning it;
        # __del__ still runs in that case and must not fail.
        directory = getattr(self, "dir", None)
        if directory is not None and os.path.exists(directory):
            shutil.rmtree(directory)

    def _close_by_signal(self, existing_handler=None):
        """Build a signal handler that cleans up, then defers to the old one.

        Args:
            existing_handler: The value previously returned by
                ``signal.getsignal`` for the signal: a callable,
                ``signal.SIG_DFL``, ``signal.SIG_IGN`` or None.

        Returns:
            A ``(signum, frame)`` handler suitable for ``signal.signal``.
        """

        def new_handler(signum, frame):
            self._close()
            if callable(existing_handler):
                existing_handler(signum, frame)
            elif existing_handler == signal.SIG_DFL:
                # The previous disposition was the OS default (for SIGTERM:
                # terminate). Restore it and re-deliver the signal so the
                # cleanup hook does not silently swallow the signal.
                signal.signal(signum, signal.SIG_DFL)
                signal.raise_signal(signum)
            # SIG_IGN / None: nothing further to do after the cleanup.

        return new_handler

    def pull_files(self,
                   s3_model_path: str = "",
                   allow_pattern: Optional[list[str]] = None,
                   ignore_pattern: Optional[list[str]] = None) -> None:
        """
        Pull files from S3 storage into the temporary directory.

        Args:
            s3_model_path: The S3 path of the model.
            allow_pattern: A list of patterns of which files to pull.
            ignore_pattern: A list of patterns of which files not to pull.

        """
        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
                                                  allow_pattern,
                                                  ignore_pattern)
        for file in files:
            # Without a trailing "/" on the prefix the remainder starts with
            # "/", and os.path.join would then discard ``self.dir`` and write
            # to an absolute path. Strip it so files stay in the temp dir.
            relative = file.removeprefix(base_dir).lstrip("/")
            if not relative:
                # The prefix named the object itself; keep its file name.
                relative = os.path.basename(file)
            destination_file = os.path.join(self.dir, relative)
            local_dir = Path(destination_file).parent
            os.makedirs(local_dir, exist_ok=True)
            self.s3.download_file(bucket_name, file, destination_file)
def encode_tokens(
    tokenizer: AnyTokenizer,
    text: str,
    *,
    add_special_tokens: Optional[bool] = None,
) -> list[int]:
    """
    Encode ``text`` into token ids regardless of the tokenizer backend.

    For a :class:`MistralTokenizer` the call is forwarded to the wrapped
    ``tokenizer.tokenizer`` with ``add_special_tokens`` used for both the
    ``bos`` and ``eos`` flags. For any other tokenizer this mirrors
    :code:`tokenizer.encode(text, add_special_tokens=...)`, omitting the
    keyword entirely when ``add_special_tokens`` is None so the tokenizer's
    own default applies.
    """
    if not isinstance(tokenizer, MistralTokenizer):
        if add_special_tokens is None:
            return tokenizer.encode(text)
        return tokenizer.encode(text, add_special_tokens=add_special_tokens)

    backend = tokenizer.tokenizer
    return backend.encode(text,
                          bos=add_special_tokens,
                          eos=add_special_tokens)
def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
    """Make ``tokenizer._pad`` accept (and drop) a ``padding_side`` keyword.

    The tokenizer's current ``_pad`` is captured and replaced, on this
    instance only, by a bound wrapper. The wrapper never forwards
    ``padding_side``; it emits a warning when a value is given that differs
    from ``tokenizer.padding_side`` and then calls the captured ``_pad``
    with the remaining arguments.
    """
    original_pad = tokenizer._pad

    def _pad(
        self: PreTrainedTokenizer,
        *args,
        padding_side: Optional[str] = None,
        **kwargs,
    ):
        requested_other_side = (padding_side is not None
                                and padding_side != self.padding_side)
        if requested_other_side:
            warnings.warn(
                "`padding_side` argument is not supported by "
                f"{type(tokenizer).__name__} and will be ignored.",
                stacklevel=2)

        return original_pad(*args, **kwargs)

    tokenizer._pad = MethodType(_pad, tokenizer)
+ if not os.path.exists(tokenizer_name): + tokenizer_path = snapshot_download( + model_id=tokenizer_name, + cache_dir=download_dir, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + # Ignore weights - we only need the tokenizer. + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + tokenizer_name = tokenizer_path + + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + + if "truncation_side" not in kwargs: + kwargs["truncation_side"] = "left" + + # Separate model folder from file path for GGUF models + is_gguf = check_gguf_file(tokenizer_name) + if is_gguf: + kwargs["gguf_file"] = Path(tokenizer_name).name + tokenizer_name = Path(tokenizer_name).parent + + # if tokenizer is from official mistral org + is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai" + if is_from_mistral_org and tokenizer_mode != "mistral": + warnings.warn( + 'It is strongly recommended to run mistral models with ' + '`--tokenizer-mode "mistral"` to ensure correct ' + 'encoding and decoding.', + FutureWarning, + stacklevel=2) + if tokenizer_mode == "mistral": + tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name), + revision=revision) + else: + try: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + *args, + trust_remote_code=trust_remote_code, + revision=revision, + **kwargs, + ) + except ValueError as e: + # If the error pertains to the tokenizer class not existing or not + # currently being imported, + # suggest using the --trust-remote-code flag. + if not trust_remote_code and ( + "does not exist or is not currently imported." in str(e) + or "requires you to execute the tokenizer file" in str(e)): + err_msg = ("Failed to load the tokenizer. 
def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[AnyTokenizer]:
    """Load the tokenizer stored alongside a LoRA adapter, if there is one.

    Returns None when ``lora_request`` is None, or when loading from
    ``lora_request.lora_path`` raises; in the latter case a warning is
    logged so the caller can fall back to the base model tokenizer.
    Extra positional and keyword arguments are passed to ``get_tokenizer``.
    """
    if lora_request is None:
        return None

    try:
        return get_tokenizer(lora_request.lora_path, *args, **kwargs)
    except Exception as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            "No tokenizer found in %s, using base model tokenizer instead. "
            "(Exception: %s)", lora_request.lora_path, e)
        return None
issubclass( + tokenizer_pool_config.pool_type, BaseTokenizerGroup): + tokenizer_cls = tokenizer_pool_config.pool_type + elif tokenizer_pool_config.pool_type == "ray": + if RayTokenizerGroupPool is None: + raise ImportError( + "RayTokenizerGroupPool is not available. Please install " + "the ray package to use the Ray tokenizer group pool.") + tokenizer_cls = RayTokenizerGroupPool + else: + raise ValueError( + f"Unknown pool type: {tokenizer_pool_config.pool_type}") + return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs) + + +__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"] diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86480780a6c7296a8f7450a1ff4a0b6be96d2c78 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53b0347698f10e134a076784c04b546345023c58 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc new file mode 100644 index 
class BaseTokenizerGroup(ABC):
    """Abstract interface for a group of tokenizers usable with LoRA adapters."""

    @classmethod
    @abstractmethod
    def from_config(
        cls,
        tokenizer_pool_config: Optional[TokenizerPoolConfig],
        **init_kwargs,
    ) -> "BaseTokenizerGroup":
        """Create a tokenizer group from a pool config and init kwargs."""

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""

    @abstractmethod
    def get_max_input_len(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""

    @abstractmethod
    def encode(
        self,
        prompt: str,
        request_id: Optional[str] = None,
        lora_request: Optional[LoRARequest] = None,
        add_special_tokens: Optional[bool] = None,
    ) -> List[int]:
        """Encode a prompt using the tokenizer group."""

    @abstractmethod
    async def encode_async(
        self,
        prompt: str,
        request_id: Optional[str] = None,
        lora_request: Optional[LoRARequest] = None,
        add_special_tokens: Optional[bool] = None,
    ) -> List[int]:
        """Encode a prompt using the tokenizer group (coroutine variant)."""

    @abstractmethod
    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request."""

    @abstractmethod
    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request (coroutine variant)."""

    def check_health(self):
        """Raise exception if the tokenizer group is unhealthy.

        The base implementation performs no check and returns None.
        """
        return None
LoRARequest +from vllm.transformers_utils.tokenizer import AnyTokenizer + +from .base_tokenizer_group import BaseTokenizerGroup +from .tokenizer_group import TokenizerGroup + +logger = init_logger(__name__) + + +class RayTokenizerGroupPool(BaseTokenizerGroup): + """A Ray-based pool of TokenizerGroups for async tokenization.""" + + # Class to use for workers making up the pool. + _worker_cls = TokenizerGroup + + @classmethod + def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig], + **init_kwargs) -> "RayTokenizerGroupPool": + if not tokenizer_pool_config: + raise ValueError("tokenizer_pool_config must not be None.") + ray_actor_options = (tokenizer_pool_config.extra_config or { + "num_cpus": 0 + }) + ray_actor_options.setdefault( + "scheduling_strategy", + NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), soft=True)) + + # Carry over the env vars to the actors. + # This is necessary for API keys and such. + ray_actor_options.setdefault("runtime_env", {}) + _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"]) + + init_kwargs["num_actors"] = tokenizer_pool_config.pool_size + init_kwargs["ray_actor_options"] = ray_actor_options + + return cls(**init_kwargs) + + def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, + max_input_length: Optional[int], num_actors: int, + ray_actor_options: dict, **tokenizer_config): + # Store a local copy of the TokenizerGroup for quick access + # to underlying HF tokenizers. 
+ self._tokenizer_config = { + "tokenizer_id": tokenizer_id, + "enable_lora": enable_lora, + "max_num_seqs": max_num_seqs, + "max_input_length": max_input_length, + **tokenizer_config + } + self._local_tokenizer_group = self._worker_cls( + **self._tokenizer_config, ) + + self._ray_tokenizer_group_cls = ray.remote( + self._worker_cls).options(**ray_actor_options) # type: ignore + self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)] + self._idle_actors: Optional[asyncio.Queue] = None + + # If set, actor is unhealthy. Will reraise on the next + # check_health call. + self._exception: Optional[ActorDiedError] = None + + def _init_actor(self) -> ray.ObjectRef: + return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config) + + @property + def pool_size(self) -> int: + return len(self.tokenizer_actors) + + def ping(self): + return ray.get([ + actor.ping.remote() # type: ignore + for actor in self.tokenizer_actors + ]) + + def _ensure_queue_initialized(self): + if self._idle_actors is None: + self._idle_actors = asyncio.Queue() + for actor in self.tokenizer_actors: + self._idle_actors.put_nowait(actor) + + def _finalize_encode(self, actor: ray.ObjectRef, + original_actor: ray.ObjectRef, actor_is_alive: bool): + assert self._idle_actors is not None + # Cleanup the dead actor. + if not actor_is_alive or original_actor is not actor: + self.tokenizer_actors.remove(original_actor) + if actor_is_alive: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + # Add back the new actor. + if original_actor is not actor: + self.tokenizer_actors.append(actor) + + def encode(self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: + """Encode a prompt using the tokenizer group. 
+ + We pick an idle actor and use it to encode the prompt. + The actor is then put back in the queue for future use. + This is blocking. + """ + self.check_health() + self._ensure_queue_initialized() + assert self._idle_actors is not None + + if self._idle_actors.empty(): + raise RuntimeError("No idle actors available.") + actor = self._idle_actors.get_nowait() + actor_is_alive = True + original_actor = actor + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens)) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens)) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() + finally: + self._finalize_encode(actor, original_actor, actor_is_alive) + return ret + + async def encode_async( + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: + """Encode a prompt using the tokenizer group. + + We pick an idle actor and use it to encode the prompt. + If there are no idle actors, we wait until one becomes + available. + The actor is then put back in the queue for future use. + This is non-blocking. 
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
    """Merge the current process environment into ``runtime_env["env_vars"]``.

    A copy of ``os.environ`` is used as the base and any variables already
    present under ``runtime_env["env_vars"]`` are applied on top, so the
    entries in ``runtime_env`` win on conflict.

    ``runtime_env`` is modified in place; nothing is returned.
    """
    explicit_vars = runtime_env.setdefault("env_vars", {})
    merged = os.environ.copy()
    merged.update(explicit_vars)
    runtime_env["env_vars"] = merged
tokenizer group is alive.""" + return True + + def get_max_input_len(self, + lora_request: Optional[LoRARequest] = None + ) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" + return self.max_input_length + + def _raise_if_input_too_long(self, + encoded_tokens: List[int], + lora_request: Optional[LoRARequest] = None): + input_length = len(encoded_tokens) + if lora_request: + max_input_length = (lora_request.long_lora_max_len + or self.max_input_length) + else: + max_input_length = self.max_input_length + if max_input_length is not None and input_length > max_input_length: + raise ValueError("Input too long.", input_length, max_input_length) + + def encode(self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: + tokenizer = self.get_lora_tokenizer(lora_request) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) + self._raise_if_input_too_long(ret, lora_request) + return ret + + async def encode_async( + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: + tokenizer = await self.get_lora_tokenizer_async(lora_request) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) + self._raise_if_input_too_long(ret, lora_request) + return ret + + def get_lora_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> AnyTokenizer: + if not lora_request or not self.enable_lora: + return self.tokenizer + if lora_request.lora_int_id not in self.lora_tokenizers: + tokenizer = (get_lora_tokenizer( + lora_request, **self.tokenizer_config) or self.tokenizer) + self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) + return tokenizer + else: + return self.lora_tokenizers[lora_request.lora_int_id] + + async def get_lora_tokenizer_async( + self, + lora_request: 
def check_gguf_file(model: Union[str, PathLike]) -> bool:
    """Tell whether ``model`` points at a GGUF model file.

    Returns False unless ``model`` is an existing regular file. A file whose
    suffix is ``.gguf`` is accepted without being opened; any other file is
    accepted only if its first four bytes are ``b"GGUF"``.
    """
    candidate = Path(model)
    if not candidate.is_file():
        return False
    if candidate.suffix == ".gguf":
        return True

    # No telling extension: sniff the 4-byte magic at the start of the file.
    with open(candidate, "rb") as stream:
        return stream.read(4) == b"GGUF"
# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort")


class FinishReason(enum.IntEnum):
    """
    Why a request finished: stop, length, or abort.

    Members are ints rather than strings for more compact serialization;
    ``str(member)`` gives the matching entry of FINISH_REASON_STRINGS.

    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
    abort - aborted for another reason

    """
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self) -> str:
        # Member values double as indices into FINISH_REASON_STRINGS.
        return FINISH_REASON_STRINGS[int(self)]
class EngineCoreRequestType(enum.Enum):
    """
    Kinds of request a client can send to the EngineCore.

    Every value is already a one-byte string, so it can be sent over
    sockets without a separate encoding step.
    """
    ADD = b"\x00"
    ABORT = b"\x01"
    PROFILE = b"\x02"
    RESET_PREFIX_CACHE = b"\x03"
b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b99f92a7d4ea46c4a2848bdfd899c677699fc2ce Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fd3b4ce1ff24666e06203b1424c8d3f47e08aaf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/llm_engine.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/llm_engine.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d80fef9887d3bf40596df323d782a7c1f50386f3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/llm_engine.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/mm_input_mapper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/mm_input_mapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77487d6c55c0738b1fca7db66bb6610860e795cf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/mm_input_mapper.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/output_processor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/output_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe9a702d490cc9cbfd3a8c9dcc6cd999359b88e6 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/output_processor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/processor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72edc887fc6cb35fb6719452f0577266b94e9e8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/processor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py b/.venv/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..3c4e35e4aa2749dfbb65eb049d77c68ebcced146 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py @@ -0,0 +1,378 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import os +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union + +import numpy as np + +from vllm.config import ModelConfig, VllmConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.protocol import EngineClient +from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs.preprocess import InputPreprocessor +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.usage.usage_lib import UsageContext +from vllm.utils import cdiv, kill_process_tree +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.output_processor 
import OutputProcessor +from vllm.v1.engine.processor import Processor +from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) +from vllm.v1.metrics.stats import IterationStats, SchedulerStats + +logger = init_logger(__name__) + + +class AsyncLLM(EngineClient): + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + log_requests: bool = True, + start_engine_loop: bool = True, + ) -> None: + + assert start_engine_loop + + self.model_config = vllm_config.model_config + + self.log_requests = log_requests + self.log_stats = log_stats + self.stat_loggers: List[StatLoggerBase] = [ + LoggingStatLogger(), + PrometheusStatLogger(vllm_config.model_config), + ] + + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + lora_config=vllm_config.lora_config) + self.tokenizer.ping() + + # Processor (converts Inputs --> EngineCoreRequests). + self.processor = Processor( + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry, + ) + + # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=self.log_stats) + + # EngineCore (starts the engine in background process). 
@classmethod
def from_engine_args(
    cls,
    engine_args: AsyncEngineArgs,
    engine_config: Optional[VllmConfig] = None,
    start_engine_loop: bool = True,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> "AsyncLLM":
    """Build an AsyncLLM from engine arguments.

    An explicitly supplied ``engine_config`` is used as-is; otherwise the
    config is created from ``engine_args`` for the given ``usage_context``.
    """
    resolved_config = (engine_config if engine_config is not None else
                       engine_args.create_engine_config(usage_context))

    # The "disable_*" flags on the args are inverted into "log_*" switches.
    init_kwargs = dict(
        vllm_config=resolved_config,
        executor_class=Executor.get_class(resolved_config),
        log_requests=not engine_args.disable_log_requests,
        log_stats=not engine_args.disable_log_stats,
        start_engine_loop=start_engine_loop,
        usage_context=usage_context,
    )
    return cls(**init_kwargs)
# TODO: we should support multiple prompts in one call, as you
# can do with LLM.generate. So that for multi-prompt completion
# requests we don't need to send multiple messages to core proc,
# and so we don't need multiple streams which then get
# re-multiplexed in the API server anyhow.
async def generate(
    self,
    prompt: PromptType,
    sampling_params: SamplingParams,
    request_id: str,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    priority: int = 0,
) -> AsyncGenerator[RequestOutput, None]:
    """
    Main function called by the API server to kick off a request.

    Steps performed here:
        * 1) Start the background output handler task on first use.
        * 2) Submit the request through add_request(), which returns
             the per-request output queue.
        * 3) Pull RequestOutputs from that queue and yield them until
             one reports ``finished``.

    Outputs that piled up in the queue are coalesced before yielding:
    with RequestOutputKind.DELTA they are merged via ``out.add()``,
    otherwise only the most recent one is kept.

    If this generator's task is cancelled, the request is aborted via
    abort() and the CancelledError is re-raised.
    """

    try:
        # We start the output_handler on the first call to generate() so
        # we can call __init__ before the event loop, which enables us
        # to handle startup failure gracefully in the OpenAI server.
        if self.output_handler is None:
            self.output_handler = asyncio.create_task(
                self._run_output_handler())

        # q is the asyncio.Queue dedicated to this request.
        q = await self.add_request(
            request_id,
            prompt,
            sampling_params,
            lora_request=lora_request,
            trace_headers=trace_headers,
            prompt_adapter_request=prompt_adapter_request,
            priority=priority,
        )

        # The output_handler task pushes items into the queue.
        # This task pulls from the queue and yields to caller.
        finished = False
        while not finished:
            # Note: drain queue without await if possible (avoids
            # task switching under load which helps performance).
            out = q.get_nowait() if not q.empty() else await q.get()

            # Coalesce any additional queued outputs
            while not q.empty():
                next_out = q.get_nowait()
                if sampling_params.output_kind == RequestOutputKind.DELTA:
                    # Deltas are merged so no increment is lost.
                    out.add(next_out)
                else:
                    # Otherwise only the newest output is kept.
                    out = next_out

            # Note: both OutputProcessor and EngineCore handle their
            # own request cleanup based on finished.
            finished = out.finished
            yield out

    # If the request is disconnected by the client, the
    # generate() task will be canceled. So, we abort the
    # request if we end up here.
    except asyncio.CancelledError:
        await self.abort(request_id)
        raise
def _log_stats(
    self,
    scheduler_stats: SchedulerStats,
    iteration_stats: IterationStats,
):
    """Forward one iteration's stats to every registered stat logger.

    Does nothing when ``self.log_stats`` is false.

    Args:
        scheduler_stats: Passed through to each logger's ``log()``.
        iteration_stats: Passed through to each logger's ``log()``.
    """
    if not self.log_stats:
        return

    # Named ``stat_logger`` so the loop variable does not shadow the
    # module-level ``logger`` used for regular log messages.
    for stat_logger in self.stat_loggers:
        stat_logger.log(scheduler_stats=scheduler_stats,
                        iteration_stats=iteration_stats)
async def get_tokenizer(
    self,
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
    """Return what the tokenizer group resolves for ``lora_request``."""
    tokenizer_group = self.tokenizer
    return tokenizer_group.get_lora_tokenizer(lora_request)
def _initialize_kv_caches(self,
                          vllm_config: VllmConfig) -> Tuple[int, int]:
    """Profile memory, size the KV cache and initialize the executor.

    Args:
        vllm_config: Engine configuration handed to get_kv_cache_config().

    Returns:
        ``(num_gpu_blocks, num_cpu_blocks)``. The GPU count comes from the
        computed KV cache config; the CPU count is always 0 here.
    """
    # perf_counter() is monotonic, so the reported duration cannot be
    # skewed by wall-clock adjustments during initialization.
    start = time.perf_counter()

    # Get all kv cache needed by the model
    kv_cache_spec = self.model_executor.get_kv_cache_spec()

    # Profiles the peak memory usage of the model to determine how much
    # memory can be allocated for kv cache.
    available_gpu_memory = self.model_executor.determine_available_memory()

    # Get the kv cache tensor size
    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                          available_gpu_memory)
    num_gpu_blocks = kv_cache_config.num_blocks
    num_cpu_blocks = 0

    # Initialize kv cache and warmup the execution
    self.model_executor.initialize(kv_cache_config)

    elapsed = time.perf_counter() - start
    logger.info(("init engine (profile, create kv cache, "
                 "warmup model) took %.2f seconds"), elapsed)
    return num_gpu_blocks, num_cpu_blocks
def step(self) -> EngineCoreOutputs:
    """Run one engine iteration and return its outputs.

    When the scheduler has no unfinished requests nothing is scheduled;
    the result then carries an empty ``outputs`` list together with the
    scheduler's current stats.
    """
    scheduler = self.scheduler

    if scheduler.has_unfinished_requests():
        # Schedule -> execute -> fold the model output back into the
        # scheduler, which produces the outputs to return.
        plan = scheduler.schedule()
        model_output = self.model_executor.execute_model(plan)
        return scheduler.update_from_output(plan, model_output)

    # Idle iteration: report scheduler statistics only.
    return EngineCoreOutputs(outputs=[],
                             scheduler_stats=scheduler.make_stats())
def run_busy_loop(self):
    """Core busy loop of the EngineCore.

    Each iteration waits for work when the scheduler is idle, drains any
    further queued client requests, steps the engine once and enqueues
    the resulting outputs. The loop never returns on its own; it ends
    only when an exception propagates out of it.
    """

    # Loop until process is sent a SIGINT or SIGTERM
    while True:
        # 1) Poll the input queue until there is work to do.
        if not self.scheduler.has_unfinished_requests():
            while True:
                # Only the blocking get() is guarded: a queue.Empty raised
                # while *handling* a request must not be mistaken for an
                # idle poll.
                try:
                    req = self.input_queue.get(timeout=POLLING_TIMEOUT_S)
                except queue.Empty:
                    logger.debug("EngineCore busy loop waiting.")
                    # Break out the loop so we can log_stats in step().
                    if self.log_stats:
                        break
                else:
                    self._handle_client_request(req)
                    break

        # 2) Handle any new client requests (Abort or Add).
        while not self.input_queue.empty():
            req = self.input_queue.get_nowait()
            self._handle_client_request(req)

        # 3) Step the engine core.
        outputs = self.step()

        # 4) Put EngineCoreOutputs into the output queue.
        self.output_queue.put_nowait(outputs)
+ buffer = bytearray() + + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + while True: + outputs = self.output_queue.get() + encoder.encode_into(outputs, buffer) + socket.send_multipart((buffer, ), copy=False) diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py b/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py new file mode 100644 index 0000000000000000000000000000000000000000..247380ef7cfedae1986602ef59abd91192ab0226 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py @@ -0,0 +1,302 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import os +import signal +import weakref +from abc import ABC, abstractmethod +from typing import List, Optional, Type + +import msgspec +import zmq +import zmq.asyncio + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, + make_zmq_socket) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion, EngineCoreResetPrefixCache) +from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.executor.abstract import Executor +from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.utils import BackgroundProcHandle + +logger = init_logger(__name__) + + +class EngineCoreClient(ABC): + """ + EngineCoreClient: subclasses handle different methods for pushing + and pulling from the EngineCore for asyncio / multiprocessing. 
+ + Subclasses: + * InprocClient: In process EngineCore (for V0-style LLMEngine use) + * SyncMPClient: ZMQ + background proc EngineCore (for LLM) + * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) + """ + + @staticmethod + def make_client( + multiprocess_mode: bool, + asyncio_mode: bool, + vllm_config: VllmConfig, + executor_class: Type[Executor], + ) -> "EngineCoreClient": + + # TODO: support this for debugging purposes. + if asyncio_mode and not multiprocess_mode: + raise NotImplementedError( + "Running EngineCore in asyncio without multiprocessing " + "is not currently supported.") + + if multiprocess_mode and asyncio_mode: + return AsyncMPClient(vllm_config, executor_class) + + if multiprocess_mode and not asyncio_mode: + return SyncMPClient(vllm_config, executor_class) + + return InprocClient(vllm_config, executor_class) + + @abstractmethod + def shutdown(self): + ... + + def get_output(self) -> EngineCoreOutputs: + raise NotImplementedError + + def add_request(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + def profile(self, is_start: bool = True) -> None: + raise NotImplementedError + + def reset_prefix_cache(self) -> None: + raise NotImplementedError + + def abort_requests(self, request_ids: List[str]) -> None: + raise NotImplementedError + + async def get_output_async(self) -> EngineCoreOutputs: + raise NotImplementedError + + async def add_request_async(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + async def profile_async(self, is_start: bool = True) -> None: + raise NotImplementedError + + async def reset_prefix_cache_async(self) -> None: + raise NotImplementedError + + async def abort_requests_async(self, request_ids: List[str]) -> None: + raise NotImplementedError + + +class InprocClient(EngineCoreClient): + """ + InprocClient: client for in-process EngineCore. 
def abort_requests(self, request_ids: List[str]) -> None:
    """Forward an abort to the in-process EngineCore.

    An empty ``request_ids`` is a no-op; the engine is not called.
    """
    if len(request_ids) == 0:
        return
    self.engine_core.abort_requests(request_ids)
def shutdown(self):
    """Clean up background resources.

    Shuts down the background process handle if one was ever set on this
    object, then always runs the finalizer.
    """
    # Sentinel lookup: equivalent to hasattr() followed by attribute access.
    not_set = object()
    handle = getattr(self, "proc_handle", not_set)
    if handle is not not_set:
        handle.shutdown()

    self._finalizer()
class AsyncMPClient(MPClient):
    """Asyncio-compatible client for multi-proc EngineCore."""

    def __init__(self, vllm_config: VllmConfig,
                 executor_class: Type[Executor]):
        """Initialise the base client with asyncio_mode=True and
        log_stats=True."""
        super().__init__(
            asyncio_mode=True,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=True,
        )

        # Both are created lazily by the first get_output_async() call.
        self.outputs_queue: Optional[asyncio.Queue[bytes]] = None
        self.queue_task: Optional[asyncio.Task] = None

    async def get_output_async(self) -> EngineCoreOutputs:
        """Return the next decoded EngineCoreOutputs message.

        The first call creates an asyncio.Queue and a background task
        that keeps moving frames from output_socket into that queue.
        """
        if self.outputs_queue is None:
            # Perform IO in separate task to parallelize as much as possible
            self.outputs_queue = asyncio.Queue()

            async def process_outputs_socket():
                assert self.outputs_queue is not None
                while True:
                    frames = await self.output_socket.recv_multipart(
                        copy=False)
                    # Exactly one frame is expected per message.
                    (frame, ) = frames
                    self.outputs_queue.put_nowait(frame.buffer)

            reader = process_outputs_socket()
            self.queue_task = asyncio.create_task(reader)

        return self.decoder.decode(await self.outputs_queue.get())

    async def _send_input(self, request_type: EngineCoreRequestType,
                          request: EngineCoreRequestUnion) -> None:
        """Send a (request type, encoded request) multipart message."""
        type_frame = request_type.value
        payload_frame = self.encoder.encode(request)
        await self.input_socket.send_multipart((type_frame, payload_frame),
                                               copy=False)

    async def add_request_async(self, request: EngineCoreRequest) -> None:
        """Submit a new request; its text prompt is cleared before sending."""
        # NOTE: text prompt is not needed in the core engine as it has been
        # tokenized.
        request.prompt = None
        await self._send_input(EngineCoreRequestType.ADD, request)

    async def abort_requests_async(self, request_ids: List[str]) -> None:
        """Send an ABORT for the given ids; nothing is sent for an empty
        list."""
        if len(request_ids) == 0:
            return
        await self._send_input(EngineCoreRequestType.ABORT, request_ids)

    async def profile_async(self, is_start: bool = True) -> None:
        """Send a PROFILE request carrying is_start."""
        profile_request = EngineCoreProfile(is_start)
        await self._send_input(EngineCoreRequestType.PROFILE,
                               profile_request)

    async def reset_prefix_cache_async(self) -> None:
        """Send a RESET_PREFIX_CACHE request."""
        reset_request = EngineCoreResetPrefixCache()
        await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE,
                               reset_request)
read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + return self.token_ids[self.prompt_len:] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. + spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + prompt_len=len(request.prompt_token_ids), + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def update_from_output( + self, + output: EngineCoreOutput, + ) -> Optional[DetokenizerOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. 
+ """ + + new_token_ids = output.new_token_ids + finish_reason = output.finish_reason + stop_reason = output.stop_reason + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = FinishReason.STOP + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. 
+ finished = finish_reason is not None + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + return DetokenizerOutput(output_text, token_ids, finished, + finish_reason, stop_reason) + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py b/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e0452bcad7ba7e1afc560230dd183ece5fe47a90 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Mapping, Optional, Type, Union + +from typing_extensions import TypeVar + +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics_types import StatLoggerBase +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.outputs import RequestOutput +from vllm.pooling_params import 
class LLMEngine:
    """Legacy LLMEngine for backwards compatibility."""

    def __init__(
        self,
        vllm_config: VllmConfig,
        executor_class: Type[Executor],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
        input_registry: InputRegistry = INPUT_REGISTRY,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
        use_cached_outputs: bool = False,
        multiprocess_mode: bool = False,
    ) -> None:
        """Build the tokenizer, Processor, OutputProcessor and the
        EngineCore client, in that order.

        NOTE(review): log_stats, usage_context, stat_loggers and
        use_cached_outputs are accepted but not read by this constructor.
        """
        model_config = vllm_config.model_config
        self.model_config = model_config

        # Tokenizer (+ ensure liveness if running in another process).
        self.tokenizer = init_tokenizer_from_configs(
            model_config=model_config,
            scheduler_config=vllm_config.scheduler_config,
            parallel_config=vllm_config.parallel_config,
            lora_config=vllm_config.lora_config)
        self.tokenizer.ping()

        # Processor (convert Inputs --> EngineCoreRequests)
        self.processor = Processor(
            model_config=model_config,
            cache_config=vllm_config.cache_config,
            lora_config=vllm_config.lora_config,
            tokenizer=self.tokenizer,
            input_registry=input_registry,
            mm_registry=mm_registry,
        )

        # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
        self.output_processor = OutputProcessor(self.tokenizer,
                                                log_stats=False)

        # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
        self.engine_core = EngineCoreClient.make_client(
            multiprocess_mode=multiprocess_mode,
            asyncio_mode=False,
            vllm_config=vllm_config,
            executor_class=executor_class,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
        enable_multiprocessing: bool = False,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""

        # Create the engine configs.
        engine_config = engine_args.create_engine_config(usage_context)
        executor_cls = Executor.get_class(engine_config)

        # VLLM_ENABLE_V1_MULTIPROCESSING overrides the argument when set.
        if VLLM_ENABLE_V1_MULTIPROCESSING:
            logger.debug("Enabling multiprocessing for LLMEngine.")
            enable_multiprocessing = True

        # Create the LLMEngine.
        init_kwargs = dict(
            vllm_config=engine_config,
            executor_class=executor_cls,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            multiprocess_mode=enable_multiprocessing,
        )
        return cls(**init_kwargs)

    def get_num_unfinished_requests(self) -> int:
        """Number of requests still tracked by the output processor."""
        return self.output_processor.get_num_unfinished_requests()

    def has_unfinished_requests(self) -> bool:
        """Whether the output processor still tracks any request."""
        return self.output_processor.has_unfinished_requests()

    @classmethod
    def validate_outputs(cls, outputs, output_type):
        """Return outputs unchanged; output_type is not inspected."""
        return outputs

    def abort_request(self, request_ids: List[str]) -> None:
        """Remove request_ids from EngineCore and Detokenizer."""

        self.engine_core.abort_requests(request_ids)
        self.output_processor.abort_requests(request_ids)

    def add_request(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
    ) -> None:
        """Convert the raw inputs to an EngineCoreRequest and register it
        with the output processor and then with the EngineCore."""

        # 1) Process raw inputs into the request.
        core_request = self.processor.process_inputs(
            request_id=request_id,
            prompt=prompt,
            params=params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            trace_headers=trace_headers,
            prompt_adapter_request=prompt_adapter_request,
            priority=priority,
        )

        # 2) Make a new RequestState and queue.
        self.output_processor.add_request(core_request)

        # 3) Add the request to EngineCore.
        self.engine_core.add_request(core_request)

    def step(self) -> List[RequestOutput]:
        """Pull one batch of outputs from the EngineCore and return the
        resulting RequestOutputs."""

        # 1) Get EngineCoreOutput from the EngineCore.
        core_outputs = self.engine_core.get_output()

        # 2) Process EngineCoreOutputs.
        processed = self.output_processor.process_outputs(
            core_outputs.outputs)

        # 3) Abort any reqs that finished due to stop strings.
        self.engine_core.abort_requests(processed.reqs_to_abort)

        return processed.request_outputs

    def get_model_config(self):
        """Return the ModelConfig taken from the VllmConfig at init."""
        return self.model_config

    def start_profile(self):
        """Ask the EngineCore to start profiling."""
        self.engine_core.profile(True)

    def stop_profile(self):
        """Ask the EngineCore to stop profiling."""
        self.engine_core.profile(False)

    def reset_prefix_cache(self):
        """Forward a prefix-cache reset to the EngineCore."""
        self.engine_core.reset_prefix_cache()

    def get_tokenizer_group(
        self,
        group_type: Type[_G] = BaseTokenizerGroup,
    ) -> _G:
        """Return the tokenizer group, checked against group_type.

        Raises:
            ValueError: if no tokenizer group is set.
            TypeError: if the tokenizer group is not a group_type instance.
        """
        group = self.tokenizer

        if group is None:
            raise ValueError("Unable to get tokenizer because "
                             "skip_tokenizer_init is True")

        if isinstance(group, group_type):
            return group

        raise TypeError("Invalid type of tokenizer group. "
                        f"Expected type: {group_type}, but "
                        f"found type: {type(group)}")
# Both Client and Server must use the same cache size
# (to perform mirrored caching)
# TODO: Tune the MM cache size
MM_CACHE_SIZE = 256


class MMInputMapperClient:
    """Client half of the mirrored multi-modal input cache.

    Runs the multi-modal input mapper (or reuses precomputed inputs) and
    caches the results by hash. For an input that is already cached,
    `None` is emitted in its place so that the payload is not sent again;
    MMInputMapperServer restores it from its own cache.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):
        self.model_config = model_config
        self.mm_registry = mm_registry
        self.multi_modal_input_mapper = mm_registry.create_input_mapper(
            model_config)
        self.mm_registry.init_mm_limits_per_prompt(model_config)

        # Init cache
        self.use_cache = not model_config.disable_mm_preprocessor_cache
        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)

        # DEBUG: Set to None to disable
        self.mm_debug_cache_hit_ratio_steps = None
        self.mm_cache_hits = 0
        self.mm_cache_total = 0

    def cache_hit_ratio(self, steps):
        """Log the hit ratio at debug level every `steps` processed inputs."""
        if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0:
            logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
                         self.mm_cache_hits / self.mm_cache_total)

    # TODO: Support modalities beyond image.
    def process_inputs(
        self,
        mm_data: MultiModalDataDict,
        mm_hashes: Optional[List[str]],
        mm_processor_kwargs: Optional[Dict[str, Any]],
        precomputed_mm_inputs: Optional[List[MultiModalKwargs]],
    ) -> List[Optional[MultiModalKwargs]]:
        """Map (or reuse) every multi-modal input and update the cache.

        Args:
            mm_data: raw multi-modal data; only the "image" entry is read,
                and only when `precomputed_mm_inputs` is None.
            mm_hashes: one hash per input; required when caching is on.
            mm_processor_kwargs: forwarded to the input mapper.
            precomputed_mm_inputs: already-mapped inputs to use instead of
                running the mapper.

        Returns:
            One entry per input, in input order. With caching on, an entry
            is None when that input was already in the cache.
        """
        if precomputed_mm_inputs is None:
            image_inputs = mm_data["image"]
            if not isinstance(image_inputs, list):
                image_inputs = [image_inputs]
            num_inputs = len(image_inputs)
        else:
            num_inputs = len(precomputed_mm_inputs)

        # Sanity
        if self.use_cache:
            assert mm_hashes is not None
            assert num_inputs == len(mm_hashes)

        # Process each image input separately, so that later we can schedule
        # them in a fine-grained manner.
        # Apply caching (if enabled) and reuse precomputed inputs (if provided)
        ret_inputs: List[Optional[MultiModalKwargs]] = []
        for input_id in range(num_inputs):
            if self.mm_debug_cache_hit_ratio_steps is not None:
                self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps)

            # Rebound on every iteration so that a hash from a previous
            # input can never be used for the current one.
            mm_hash: Optional[str] = None
            mm_input = None
            if self.use_cache:
                assert mm_hashes is not None
                mm_hash = mm_hashes[input_id]
                mm_input = self.mm_cache.get(mm_hash)

            self.mm_cache_total += 1
            if mm_input is None:
                if precomputed_mm_inputs is not None:
                    # Reuse precomputed input (for merged preprocessor)
                    mm_input = precomputed_mm_inputs[input_id]
                else:
                    # Apply MM mapper
                    mm_input = self.multi_modal_input_mapper(
                        {"image": [image_inputs[input_id]]},
                        mm_processor_kwargs=mm_processor_kwargs,
                    )

                if self.use_cache:
                    # Add to cache
                    assert mm_hash is not None
                    self.mm_cache.put(mm_hash, mm_input)
            else:
                self.mm_cache_hits += 1
                mm_input = None  # Avoids sending mm_input to Server

            ret_inputs.append(mm_input)

        return ret_inputs


class MMInputMapperServer:
    """Server half of the mirrored multi-modal input cache.

    Stores every input it receives by hash and replaces `None`
    placeholders with the cached value for that hash.
    """

    def __init__(self, model_config: ModelConfig):
        self.use_cache = not model_config.disable_mm_preprocessor_cache
        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)

    def process_inputs(
        self,
        mm_inputs: List[Optional[MultiModalKwargs]],
        mm_hashes: List[str],
    ) -> List[MultiModalKwargs]:
        """Return `mm_inputs` with every None filled in from the cache.

        With caching disabled the list is returned untouched. Otherwise
        each non-None input is stored under its hash, and each None is
        looked up by its hash (the lookup is asserted to succeed).
        """
        assert len(mm_inputs) == len(mm_hashes)

        if not self.use_cache:
            return mm_inputs

        full_mm_inputs = []
        for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
            assert mm_hash is not None
            if mm_input is None:
                mm_input = self.mm_cache.get(mm_hash)
                assert mm_input is not None
            else:
                self.mm_cache.put(mm_hash, mm_input)

            full_mm_inputs.append(mm_input)

        return full_mm_inputs
@dataclass
class OutputProcessorOutput:
    """Result of one OutputProcessor.process_outputs() call."""

    # RequestOutputs of requests that have no per-request queue.
    request_outputs: List[RequestOutput]
    # Ids finished by the detokenizer but not yet by the EngineCore.
    reqs_to_abort: List[str]
    # Stats object that was updated while processing the batch.
    iteration_stats: IterationStats


class RequestState:
    """Per-request bookkeeping kept by the OutputProcessor."""

    def __init__(
        self,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: List[int],
        detokenizer: IncrementalDetokenizer,
        arrival_time: float,
        queue: Optional[asyncio.Queue[RequestOutput]],
    ):
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.prompt_len = len(prompt_token_ids)
        self.detokenizer = detokenizer
        # Cleared by process_outputs() after the first output is handled.
        self.is_prefilling = True
        self.queue = queue

        self.stats = RequestStateStats(last_token_time=arrival_time)

    @classmethod
    def from_new_request(
        cls,
        tokenizer: AnyTokenizer,
        request: EngineCoreRequest,
        queue: Optional[asyncio.Queue[RequestOutput]] = None,
    ) -> "RequestState":
        """Build the state (including a fresh IncrementalDetokenizer) for
        a newly added EngineCoreRequest."""
        return cls(
            request_id=request.request_id,
            prompt=request.prompt,
            prompt_token_ids=request.prompt_token_ids,
            detokenizer=IncrementalDetokenizer.from_new_request(
                tokenizer=tokenizer,
                request=request,
            ),
            arrival_time=request.arrival_time,
            queue=queue,
        )


class OutputProcessor:
    """Process EngineCoreOutputs into RequestOutputs."""

    def __init__(
        self,
        tokenizer: BaseTokenizerGroup,
        log_stats: bool,
    ):
        self.log_stats = log_stats
        self.tokenizer = tokenizer
        # Live requests, keyed by request id.
        self.request_states: Dict[str, RequestState] = {}

    def is_request_active(self, request_id: str) -> bool:
        """Whether a state is currently tracked for `request_id`."""
        return request_id in self.request_states

    def get_num_unfinished_requests(self) -> int:
        """Number of requests currently tracked."""
        return len(self.request_states)

    def has_unfinished_requests(self) -> bool:
        """Whether at least one request is currently tracked."""
        return len(self.request_states) > 0

    def abort_requests(
        self,
        request_ids: List[str],
    ) -> None:
        """Drop the states of the given ids; unknown ids are ignored."""
        for request_id in request_ids:
            self.request_states.pop(request_id, None)

    def add_request(
        self,
        request: EngineCoreRequest,
        queue: Optional[asyncio.Queue[RequestOutput]] = None,
    ) -> None:
        """Start tracking `request`.

        Raises:
            ValueError: if a request with the same id is already tracked.
        """
        request_id = request.request_id
        if request_id in self.request_states:
            raise ValueError(f"Request id {request_id} already running.")

        self.request_states[request_id] = RequestState.from_new_request(
            tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
            request=request,
            queue=queue)

    def process_outputs(
        self,
        engine_core_outputs: List[EngineCoreOutput],
        iteration_stats: Optional[IterationStats] = None,
    ) -> OutputProcessorOutput:
        """
        Process the EngineCoreOutputs:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.

            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.

        ****************** NOTE FOR DEVELOPERS ******************

        VLLM V1 minimizes the number of python loops over the full
        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.

        If you need to touch every element of the batch, implement a
        method called XXXClass.update_from_output() to be called
        within the loop below. For examples, see:
            * IterationStats.update_from_output()
            * Detokenizer.update_from_output()

        TODO(rob): add Protocol makes update_from_output explicit.

        **********************************************************
        """

        request_outputs: List[RequestOutput] = []
        reqs_to_abort: List[str] = []
        # Only create a stats object when the caller did not supply one.
        # An identity check is used so that a supplied object is kept even
        # if it happens to evaluate as false.
        if iteration_stats is None:
            iteration_stats = IterationStats(self.log_stats)
        for engine_core_output in engine_core_outputs:
            req_id = engine_core_output.request_id
            req_state = self.request_states.get(req_id)
            if req_state is None:
                # Ignore output for already-aborted request.
                continue

            # 1) Compute stats for this iteration.
            iteration_stats.update_from_output(engine_core_output,
                                               req_state.is_prefilling,
                                               req_state.prompt_len,
                                               req_state.stats)
            req_state.is_prefilling = False

            # 2) Detokenize the token ids into text.
            detokenizer_output = req_state.detokenizer.update_from_output(
                engine_core_output)

            # 3) Create and handle RequestOutput objects.
            if detokenizer_output is not None:
                request_output = self._make_request_output(
                    req_state, detokenizer_output)

                if req_state.queue is not None:
                    # AsyncLLM: put into queue for handling by generate().
                    req_state.queue.put_nowait(request_output)
                else:
                    # LLMEngine: return list of RequestOutputs.
                    request_outputs.append(request_output)

                # Free completed requests.
                if request_output.finished:
                    assert detokenizer_output.finish_reason is not None

                    self.request_states.pop(req_id)
                    if not engine_core_output.finished:
                        # If req not finished in EngineCore, but Detokenizer
                        # detected stop string, abort needed in EngineCore.
                        reqs_to_abort.append(req_id)

                    # Track per-request stats
                    iteration_stats.update_from_finished_request(
                        detokenizer_output.finish_reason, request_output,
                        req_state.stats)

        return OutputProcessorOutput(
            request_outputs=request_outputs,
            reqs_to_abort=reqs_to_abort,
            iteration_stats=iteration_stats,
        )

    @staticmethod
    def _make_request_output(
        request_state: RequestState,
        detokenizer_output: DetokenizerOutput,
    ) -> RequestOutput:
        """Build a RequestOutput from the request state and the latest
        detokenizer output; finish/stop reasons are filled in only when
        the detokenizer reports the request as finished."""
        request_output = RequestOutput.new(
            request_state.request_id,
            request_state.prompt,
            request_state.prompt_token_ids,
            detokenizer_output.output_text,
            detokenizer_output.token_ids,
            detokenizer_output.finished,
        )
        if detokenizer_output.finished:
            completion_output = request_output.outputs[0]
            completion_output.finish_reason = str(
                detokenizer_output.finish_reason)
            completion_output.stop_reason = detokenizer_output.stop_reason

        return request_output
class Processor:
    """Turn raw prompts and sampling params into EngineCoreRequests."""

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        lora_config: Optional[LoRAConfig],
        tokenizer: BaseTokenizerGroup,
        input_registry: InputRegistry = INPUT_REGISTRY,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):

        self.model_config = model_config
        self.lora_config = lora_config
        self.tokenizer = tokenizer

        self.generation_config_fields = model_config.try_get_generation_config(
        )
        self.input_preprocessor = InputPreprocessor(model_config,
                                                    self.tokenizer,
                                                    mm_registry)
        self.input_processor = input_registry.create_input_processor(
            model_config)

        # Multi-modal (huggingface) input mapper. It is given the same
        # registry as the preprocessor above, so that a caller-supplied
        # `mm_registry` is used consistently.
        self.mm_input_mapper_client = MMInputMapperClient(
            model_config, mm_registry)

        # Multi-modal hasher (for images)
        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
            cache_config.enable_prefix_caching

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
    ) -> EngineCoreRequest:
        """Preprocess and validate a prompt and build its EngineCoreRequest.

        `arrival_time` defaults to the current time. `params` must be a
        SamplingParams; it is cloned before being updated from the
        generation config. `priority` must be 0 and `trace_headers` must
        be None (both are asserted).

        Raises:
            ValueError: if a LoRA request is given while LoRA is not
                enabled, or if the prompt fails validation.
            NotImplementedError: for encoder-decoder inputs.
        """

        # TODO(woosuk): Support pooling models.
        # TODO(woosuk): Check max_logprobs
        # TODO(woosuk): Support encoder-decoder models.

        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        if arrival_time is None:
            arrival_time = time.time()
        assert priority == 0, "vLLM V1 does not support priority at the moment."
        assert trace_headers is None, "vLLM V1 does not support tracing yet."

        # Process inputs.
        preprocessed_inputs = self.input_preprocessor.preprocess(
            prompt,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
        processed_inputs = self.input_processor(preprocessed_inputs)
        self._validate_model_inputs(processed_inputs)
        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

        if is_encoder_decoder_inputs(processed_inputs):
            decoder_inputs = SingletonInputsAdapter(
                processed_inputs["decoder"])
            encoder_inputs = SingletonInputsAdapter(
                processed_inputs["encoder"])
        else:
            decoder_inputs = SingletonInputsAdapter(processed_inputs)
            encoder_inputs = None

        # TODO: Impl encoder-decoder
        if encoder_inputs is not None:
            raise NotImplementedError

        assert isinstance(params, SamplingParams)
        # TODO: can we avoid cloning here in multiproc case
        sampling_params = params.clone()
        sampling_params.update_from_generation_config(
            self.generation_config_fields, eos_token_id)

        # Multimodal related.
        # Compute MM hashes (if enabled)
        mm_hashes = None
        if self.use_hash:
            # Use mm_hashes from processed inputs if the model has merged
            # input processor.
            if decoder_inputs.multi_modal_hashes:
                mm_hashes = decoder_inputs.multi_modal_hashes
            # Fallback to using MultiModalHasher directly.
            else:
                mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)

        # For merged preprocessor, mm_data is already mm_inputs
        precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
        decoder_mm_data = decoder_inputs.multi_modal_data
        if isinstance(decoder_mm_data, MultiModalKwargs):
            # The output of merged multi-modal processor (`decoder_mm_data`)
            # contains the kwargs for all items from all modalities.
            # This code separates them so that there is one set of kwargs
            # per item per modality.
            precomputed_mm_inputs = [
                MultiModalKwargs.from_items([item])
                for modality in decoder_mm_data.modalities
                for item in decoder_mm_data.get_items(modality)
            ]

        mm_positions = decoder_inputs.multi_modal_placeholders

        # Last-mile processing of multimodal metadata and inputs.
        if mm_positions:

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            # NOTE: interleaved modalities are not supported.
            (
                sorted_modalities,
                sorted_mm_positions,
                sorted_mm_hashes,
            ) = merge_and_sort_multimodal_metadata(
                mm_positions,
                mm_hashes,
            )

            # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
            # modalities involved AND the model supports merged input processor.
            if len(sorted_modalities) > 1 and precomputed_mm_inputs:

                modality_order_dict = {
                    modality: order
                    for order, modality in enumerate(sorted_modalities)
                }

                # Sanity check to make sure each multimodal input has only one
                # modality key.
                for mm_input in precomputed_mm_inputs:
                    assert len(mm_input.modalities) == 1

                # Sort MultiModalKwargs to match sorted_mm_positions
                precomputed_mm_inputs = sorted(
                    precomputed_mm_inputs,
                    key=lambda mm_input: modality_order_dict[list(
                        mm_input.modalities)[0]])

            # Apply mm input cache update (and input mapper if necessary).
            sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
                mm_data=decoder_mm_data,
                mm_hashes=sorted_mm_hashes,
                mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
                precomputed_mm_inputs=precomputed_mm_inputs,
            )
        else:
            sorted_mm_inputs = None
            sorted_mm_hashes = None
            sorted_mm_positions = None

        return EngineCoreRequest(
            request_id=request_id,
            prompt=decoder_inputs.prompt,
            prompt_token_ids=decoder_inputs.prompt_token_ids,
            mm_inputs=sorted_mm_inputs,
            mm_hashes=sorted_mm_hashes,
            mm_placeholders=sorted_mm_positions,
            sampling_params=sampling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
        )

    def _validate_model_inputs(self, inputs: ProcessorInputs):
        """Reject empty prompts and prompts that do not fit the model.

        A prompt is rejected when it has no token ids or when its length
        is greater than or equal to `max_model_len`.

        Raises:
            ValueError: on any of the conditions above.
        """
        if is_encoder_decoder_inputs(inputs):
            # For encoder-decoder multimodal models, the max_prompt_len
            # restricts the decoder prompt length
            prompt_inputs = inputs["decoder" if self.model_config.
                                   is_multimodal_model else "encoder"]
        else:
            prompt_inputs = inputs

        prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids

        if prompt_ids is None or len(prompt_ids) == 0:
            raise ValueError("Prompt cannot be empty")

        prompt_len = len(prompt_ids)
        max_prompt_len = self.model_config.max_model_len

        # The multimodal check has to run before the generic one: the
        # generic check also covers every over-long prompt, so in the
        # opposite order the more helpful message below is never raised.
        if self.model_config.is_multimodal_model \
                and prompt_len > max_prompt_len:
            raise ValueError(
                f"The prompt (total length {prompt_len}) is too long "
                f"to fit into the model (context length {max_prompt_len}). "
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens plus multimodal tokens. For image "
                "inputs, the number of image tokens depends on the number "
                "of images, and possibly their aspect ratios as well.")

        if prompt_len >= max_prompt_len:
            raise ValueError(
                f"Prompt length of {prompt_len} is longer than the "
                f"maximum model length of {max_prompt_len}.")

        # TODO: Find out how many placeholder tokens are there so we can
        # check that chunked prefill does not truncate them
        # max_batch_len = self.scheduler_config.max_num_batched_tokens
class LoggingStatLogger(StatLoggerBase):
    """Stat logger that periodically writes a summary line to the module
    logger.

    Token counts are accumulated on every `log()` call and reported as
    average throughput once more than `_LOCAL_LOGGING_INTERVAL_SEC` seconds
    have passed since the previous report.
    """

    def __init__(self):
        self._reset(time.monotonic())

    def _reset(self, now):
        """Start a new logging interval beginning at `now`."""
        self.last_log_time = now

        # Tracked stats over current local logging interval.
        self.num_prompt_tokens: List[int] = []
        self.num_generation_tokens: List[int] = []

    def _local_interval_elapsed(self, now: float) -> bool:
        """Return True once the current logging interval has elapsed."""
        # Log every _LOCAL_LOGGING_INTERVAL_SEC.
        elapsed_time = now - self.last_log_time
        return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC

    def _track_iteration_stats(self, iteration_stats: IterationStats):
        """Record the token counters of one iteration."""
        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
        self.num_generation_tokens.append(
            iteration_stats.num_generation_tokens)

    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
        """Return the tracked total divided by the seconds elapsed since
        the last report.

        Only called from `log()` after the interval check has passed, so
        the elapsed time is strictly positive.
        """
        # Compute summary metrics for tracked stats
        return float(np.sum(tracked_stats) / (now - self.last_log_time))

    def log(self, scheduler_stats: SchedulerStats,
            iteration_stats: IterationStats):
        """Log Stats to standard output."""

        self._track_iteration_stats(iteration_stats)

        now = time.monotonic()
        if not self._local_interval_elapsed(now):
            return

        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
        generation_throughput = self._get_throughput(
            self.num_generation_tokens, now)

        # Reset only after the throughputs are computed: both depend on the
        # start time and the counters of the interval that just ended.
        self._reset(now)

        # Format and print output.
        # The fields are comma-separated; the original string was missing
        # the separator between "Waiting" and "GPU KV cache usage".
        logger.info(
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
            "Running: %d reqs, Waiting: %d reqs, "
            "GPU KV cache usage: %.1f%%.",
            prompt_throughput,
            generation_throughput,
            scheduler_stats.num_running_reqs,
            scheduler_stats.num_waiting_reqs,
            scheduler_stats.gpu_cache_usage * 100,
        )
1 means 100 percent usage.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_prompt_tokens = prometheus_client.Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_generation_tokens = prometheus_client.Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_request_success: Dict[FinishReason, + prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name="vllm:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"]) + for reason in FinishReason: + self.counter_request_success[ + reason] = counter_request_success_base.labels(*(labelvalues + + [str(reason)])) + + self.histogram_num_prompt_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_generation_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_to_first_token = \ + prometheus_client.Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ], + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_per_output_token = \ + prometheus_client.Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 
def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.

    Args:
        mantissa_lst: Multipliers applied to each power of 10, in the order
            they should appear within one decade.
        max_value: Largest value a bucket may have (inclusive).

    Returns:
        The bucket boundaries, in generation order. Empty if even the first
        candidate exceeds `max_value`.

    Raises:
        ValueError: If `mantissa_lst` contains no positive value. In that
            case no candidate can ever exceed `max_value`, so the
            generation loop would never terminate.
    """
    if not any(m > 0 for m in mantissa_lst):
        raise ValueError(
            "mantissa_lst must contain at least one positive value")

    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                # Stop at the first candidate above the maximum.
                return buckets
        exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Builds buckets following the 1-2-5 series up to `max_value`.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    return build_buckets([1, 2, 5], max_value)
"EngineCoreOutput", + is_prefilling: bool, prompt_len: int, + request_state_stats: RequestStateStats): + if not self.log_stats: + return + + num_new_generation_tokens = len(output.new_token_ids) + now = time.time() + last_token_latency = now - request_state_stats.last_token_time + + self.num_generation_tokens += num_new_generation_tokens + if is_prefilling: + # This relies on the invariant that EngineCore does + # not stream outputs for partially completed prefills + # (scheduler.update_from_output makes EngineCoreOutput + # iff num_computed_tokens == num_tokens). + assert (num_new_generation_tokens > 0) + self.num_prompt_tokens += prompt_len + + self.time_to_first_tokens_iter.append(last_token_latency) + else: + self.time_per_output_tokens_iter.append(last_token_latency) + + request_state_stats.num_generation_tokens += num_new_generation_tokens + request_state_stats.last_token_time = now + + def update_from_finished_request(self, finish_reason: "FinishReason", + request_output: "RequestOutput", + request_state_stats: RequestStateStats): + self.finished_requests.append( + FinishedRequestStats(finish_reason, + len(request_output.prompt_token_ids), + request_state_stats.num_generation_tokens)) diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62ce2c2e9becb4bd1a8a692a76307d198dbc6eda Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/block_table.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/block_table.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1df90a156897a28a335a965a043f49c6dfd449ac Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/block_table.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_input_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_input_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57ac73e1c5edd269f3e9e245616b235a6eedc503 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_input_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_model_runner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_model_runner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94c0bd5d231071e29daacc560d65a1e80d3e3295 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_model_runner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_worker.cpython-311.pyc b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_worker.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..866a17de0b37c8b9a61e1692dfa168e0da3d2aed Binary files /dev/null and b/.venv/lib/python3.11/site-packages/vllm/v1/worker/__pycache__/gpu_worker.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py b/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..ec6d04cd497527776e8e3bcb3b0fb9677b872cb0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py @@ -0,0 +1,1113 @@ +# SPDX-License-Identifier: Apache-2.0 + +import gc +import time +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast + +import numpy as np +import torch +import torch.distributed +import 
torch.nn as nn + +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention +from vllm.config import CompilationLevel, VllmConfig +from vllm.distributed.parallel_state import graph_capture +from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY +from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.model_loader import get_model +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import group_mm_inputs_by_modality +from vllm.sampling_params import SamplingType +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + LayerBlockType, cdiv, is_pin_memory_available) +from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, + FlashAttentionMetadata) +from vllm.v1.core.encoder_cache_manager import compute_encoder_budget +from vllm.v1.engine.mm_input_mapper import MMInputMapperClient +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + +logger = init_logger(__name__) + + +class GPUModelRunner: + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = 
vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config + self.device = device + self.pin_memory = is_pin_memory_available() + self.dtype = self.model_config.dtype + if cache_config.cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + cache_config.cache_dtype] + + self.is_multimodal_model = model_config.is_multimodal_model + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + self.max_model_len = model_config.max_model_len + self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) + self.max_num_tokens = scheduler_config.max_num_batched_tokens + self.max_num_reqs = scheduler_config.max_num_seqs + + # Model-related. + self.num_attn_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) + self.num_query_heads = model_config.get_num_attention_heads( + parallel_config) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + self.input_registry = INPUT_REGISTRY + self.mm_registry = MULTIMODAL_REGISTRY + + # NOTE: Initialized input mapper is only used for processing dummy + # multimodal data into multimodal kwargs for GPU memory profiling. 
+ self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config) + self.mm_input_mapper_profiling.use_cache = False + + encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + ) + self.max_num_encoder_input_tokens = encoder_compute_budget + self.encoder_cache_size = encoder_cache_size + + # Lazy initialization + # self.model: nn.Module # Set after load_model + self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + + # Request states. + self.requests: Dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=model_config.get_vocab_size(), + ) + + self.use_cuda_graph = (self.vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE + and not self.model_config.enforce_eager) + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + # The convention is different. + # self.cudagraph_batch_sizes sorts in ascending order. + # The batch sizes in the config are in descending order. + self.cudagraph_batch_sizes = list( + reversed( + self.vllm_config.compilation_config.cudagraph_capture_sizes)) + + # Cache the device properties. + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + + # Persistent buffers for CUDA graphs. 
+ self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros( + (3, self.max_num_tokens + 1), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) + + # OPTIMIZATION: Cache the tensors rather than creating them every step. + self.arange_np = np.arange(max(self.max_num_reqs + 1, + self.max_model_len, + self.max_num_tokens), + dtype=np.int32) + # NOTE(woosuk): These tensors are "stateless", i.e., they are literally + # a faster version of creating a new tensor every time. Thus, we should + # not make any assumptions about the values in these tensors. 
+ self.input_ids_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.input_ids_np = self.input_ids_cpu.numpy() + self.positions_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + self.positions_np = self.positions_cpu.numpy() + self.slot_mapping_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.slot_mapping_np = self.slot_mapping_cpu.numpy() + self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.query_start_loc_np = self.query_start_loc_cpu.numpy() + self.seq_lens_cpu = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.seq_lens_np = self.seq_lens_cpu.numpy() + + def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + Returns: + True if there is a new/resumed/paused/finished request in the batch. + If False, we can skip copying SamplingMetadata to the GPU. + """ + # Remove finished requests from the cached states. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. 
+ removed_req_indices: List[int] = [] + for req_id in scheduler_output.finished_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) + + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: + req_index = self.input_batch.remove_request(req_id) + assert req_index is not None + removed_req_indices.append(req_index) + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. 
+ for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + prompt=new_req_data.prompt, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + req_ids_to_add.append(req_id) + + # Update the states of the running/resumed requests. + for req_data in scheduler_output.scheduled_cached_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + # Update the cached states. 
def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
    """Build the decoder inputs and attention metadata for this step.

    For every scheduled token this computes its position, gathers its token
    id and derives its KV-cache slot, all with vectorized NumPy/torch ops on
    CPU-side buffers. The results are then copied with ``non_blocking=True``
    into the persistent input buffers / onto ``self.device``. Finally it
    decides whether cascade attention is used and assembles the
    ``FlashAttentionMetadata``.

    Args:
        scheduler_output: The scheduler's decision for this step. This
            method reads ``total_num_scheduled_tokens``,
            ``num_scheduled_tokens`` (indexed by request id) and
            ``num_common_prefix_blocks``.

    Returns:
        A tuple ``(attn_metadata, logits_indices)``. ``logits_indices`` is
        ``query_start_loc[1:] - 1``, i.e. the flat index of the last
        scheduled token of each request.
    """
    total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
    assert total_num_scheduled_tokens > 0
    num_reqs = self.input_batch.num_reqs
    assert num_reqs > 0

    # OPTIMIZATION: Start copying the block table first.
    # This way, we can overlap the copy with the following CPU operations.
    self.input_batch.block_table.commit(num_reqs)

    # Get the number of scheduled tokens for each request.
    # TODO: The Python loop can be slow. Optimize.
    num_scheduled_tokens = []
    max_num_scheduled_tokens = 0
    for req_id in self.input_batch.req_ids[:num_reqs]:
        assert req_id is not None
        num_tokens = scheduler_output.num_scheduled_tokens[req_id]
        num_scheduled_tokens.append(num_tokens)
        max_num_scheduled_tokens = max(max_num_scheduled_tokens,
                                       num_tokens)
    num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32)
    assert max_num_scheduled_tokens > 0

    # Get request indices.
    # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
    req_indices = np.repeat(self.arange_np[:num_reqs],
                            num_scheduled_tokens)

    # Get batched arange.
    # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    # Equivalent to but faster than:
    # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
    # Step 1. [2, 5, 3] -> [2, 7, 10]
    cu_num_tokens = np.cumsum(num_scheduled_tokens)
    # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
    cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
                                num_scheduled_tokens)
    # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets

    # Get positions.
    # The sum is written in place into the persistent `positions_np` buffer.
    # NOTE(review): the device copy further down reads `self.positions_cpu`
    # (and likewise `slot_mapping_cpu`, `query_start_loc_cpu`,
    # `seq_lens_cpu`), while the values are written through the `*_np`
    # attributes. Presumably each `*_np` array is a NumPy view of the
    # matching CPU tensor -- confirm where the buffers are allocated.
    positions_np = self.positions_np[:total_num_scheduled_tokens]
    np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
           arange,
           out=positions_np)

    # Calculate M-RoPE positions.
    # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
    if self.model_config.uses_mrope:
        self._calc_mrope_positions(scheduler_output)

    # Get token indices.
    # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
    # where M is the max_model_len.
    token_indices = (positions_np +
                     req_indices * self.input_batch.token_ids_cpu.shape[1])
    # NOTE(woosuk): We use torch.index_select instead of np.take here
    # because torch.index_select is much faster than np.take for large
    # tensors.
    torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
                       0,
                       torch.from_numpy(token_indices),
                       out=self.input_ids_cpu[:total_num_scheduled_tokens])

    # Calculate the slot mapping.
    # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
    # where K is the max_num_blocks_per_req and the block size is 2.
    # NOTE(woosuk): We can't simply use `token_indices // block_size` here
    # because M (max_model_len) is not necessarily divisible by block_size.
    block_table_indices = (req_indices * self.max_num_blocks_per_req +
                           positions_np // self.block_size)
    # NOTE: Unlike the token-id gather above, this lookup does not call
    # torch.index_select; it indexes the flattened CPU block-table tensor
    # directly with the NumPy index array and converts the result back to
    # NumPy for the slot arithmetic below.
    block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
    block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
    block_offsets = positions_np % self.block_size
    # slot = block_number * block_size + offset_within_block
    np.add(block_numbers * self.block_size,
           block_offsets,
           out=self.slot_mapping_np[:total_num_scheduled_tokens])

    # Prepare the attention metadata.
    # query_start_loc is the exclusive prefix sum of the per-request token
    # counts: entry 0 is 0 and entry i + 1 is cu_num_tokens[i].
    self.query_start_loc_np[0] = 0
    self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens

    # Sequence length after this step = already computed + newly scheduled.
    self.seq_lens_np[:num_reqs] = (
        self.input_batch.num_computed_tokens_cpu[:num_reqs] +
        num_scheduled_tokens)
    max_seq_len = self.seq_lens_np[:num_reqs].max()

    # Copy the tensors to the GPU.
    self.input_ids[:total_num_scheduled_tokens].copy_(
        self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
    if self.model_config.uses_mrope:
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
            self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
            non_blocking=True)
    else:
        # Common case (1D positions)
        self.positions[:total_num_scheduled_tokens].copy_(
            self.positions_cpu[:total_num_scheduled_tokens],
            non_blocking=True)
    query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to(
        self.device, non_blocking=True)
    seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device,
                                               non_blocking=True)
    slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to(
        self.device, non_blocking=True).long()

    # Prepare for cascade attention if needed.
    common_prefix_len = (scheduler_output.num_common_prefix_blocks *
                         self.block_size)
    if common_prefix_len == 0:
        # Common case.
        use_cascade = False
    else:
        # NOTE(woosuk): Cascade attention uses two attention kernels: one
        # for the common prefix and the other for the rest. For the first
        # kernel, we concatenate all the query tokens (possibly from
        # different requests) and treat them as if they are from the same
        # request. Then, we use bi-directional attention to process the
        # common prefix in the KV cache. Importantly, this means that the
        # first kernel does not do any masking.

        # Consider the following example:
        # Request 1's input query: [D, E, X]
        # Request 1's kv cache: [A, B, C, D, E, X]
        # Request 1's num_computed_tokens: 3 (i.e., [A, B, C])
        # Request 2's input query: [E, Y]
        # Request 2's kv cache: [A, B, C, D, E, Y]
        # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D])

        # If we use [A, B, C, D, E] as the common prefix, then the
        # first kernel will compute the bi-directional attention between
        # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E].
        # However, this is wrong because D in Request 1 should not attend to
        # E in the common prefix (i.e., we need masking).
        # To avoid this, [A, B, C, D] should be the common prefix.
        # That is, the common prefix should be capped by the minimum
        # num_computed_tokens among the requests, and plus one to include
        # the first token of the query.

        # In practice, we use [A, B, C] as the common prefix, instead of
        # [A, B, C, D] (i.e., the common prefix is capped by the minimum
        # num_computed_tokens, without plus one).
        # This is because of an implementation detail: We want to always
        # use two kernels for cascade attention. Let's imagine:
        # Request 3's input query: [D]
        # Request 3's kv cache: [A, B, C, D]
        # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D])
        # If we use [A, B, C, D] as the common prefix for Request 1-3,
        # then Request 3 will be processed only by the first kernel,
        # and the second kernel will get an empty input. While this is not
        # a fundamental problem, our current implementation does not support
        # this case.
        common_prefix_len = min(
            common_prefix_len,
            self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
        # common_prefix_len should be a multiple of the block size.
        common_prefix_len = (common_prefix_len // self.block_size *
                             self.block_size)
        use_cascade = FlashAttentionBackend.use_cascade_attention(
            common_prefix_len=common_prefix_len,
            query_lens=num_scheduled_tokens,
            num_query_heads=self.num_query_heads,
            num_kv_heads=self.num_kv_heads,
            use_alibi=False,  # FIXME
            use_sliding_window=self.sliding_window is not None,
            num_sms=self.num_sms,
        )

    if use_cascade:
        # TODO: Optimize.
        # All query tokens are treated as one sequence for the prefix
        # kernel, hence a single [0, total] range and a single KV length.
        cu_prefix_query_lens = torch.tensor(
            [0, total_num_scheduled_tokens],
            dtype=torch.int32,
            device=self.device)
        prefix_kv_lens = torch.tensor([common_prefix_len],
                                      dtype=torch.int32,
                                      device=self.device)
        # Per-request KV length that remains after the shared prefix.
        suffix_kv_lens = (self.seq_lens_np[:num_reqs] - common_prefix_len)
        suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to(self.device)
    else:
        cu_prefix_query_lens = None
        prefix_kv_lens = None
        suffix_kv_lens = None

    attn_metadata = FlashAttentionMetadata(
        num_actual_tokens=total_num_scheduled_tokens,
        max_query_len=max_num_scheduled_tokens,
        query_start_loc=query_start_loc,
        max_seq_len=max_seq_len,
        seq_lens=seq_lens,
        block_table=(
            self.input_batch.block_table.get_device_tensor()[:num_reqs]),
        slot_mapping=slot_mapping,
        use_cascade=use_cascade,
        common_prefix_len=common_prefix_len,
        cu_prefix_query_lens=cu_prefix_query_lens,
        prefix_kv_lens=prefix_kv_lens,
        suffix_kv_lens=suffix_kv_lens,
    )
    # NOTE(woosuk): Due to chunked prefills, the batch may contain partial
    # requests. While we should not sample any token from these partial
    # requests, we do so for simplicity. We will ignore the sampled
    # tokens from the partial requests.
    # TODO: Support prompt logprobs.
    # query_start_loc[1:] holds the end offset of each request, so
    # subtracting one yields the index of each request's last token.
    logits_indices = query_start_loc[1:] - 1
    return attn_metadata, logits_indices
+ req_id_output_token_ids: Dict[str, List[int]] = \ + {req_id: req.output_token_ids \ + for req_id, req in self.requests.items()} + + sampling_metadata = self.input_batch.make_sampling_metadata( + req_id_output_token_ids, skip_copy=not batch_changed) + return sampling_metadata + + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[str, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. + grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. 
+ curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + for output in curr_group_outputs: + encoder_outputs.append(output) + + # Cache the encoder outputs. + for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + assert req_id is not None + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
@torch.inference_mode()
def execute_model(
    self,
    scheduler_output: "SchedulerOutput",
) -> ModelRunnerOutput:
    """Run one forward pass plus sampling for the scheduled batch.

    The method (1) syncs the persistent batch with the scheduler output,
    (2) for multimodal models runs the encoder and gathers the encoder
    outputs needed in this step, (3) prepares decoder inputs and attention
    metadata, (4) runs the model, padding the token count when the CUDA
    graph path is taken, (5) computes logits at ``logits_indices`` and
    samples, and (6) records the sampled tokens in the CPU-side request
    state.

    Args:
        scheduler_output: The scheduler's decision for this step.

    Returns:
        A ``ModelRunnerOutput`` carrying the request ids, the
        request-id-to-index mapping, the sampled token ids as a Python list,
        and the logprob tensors moved to CPU (or ``None`` when the sampler
        did not produce them).
    """
    batch_changed = self._update_states(scheduler_output)

    if self.is_multimodal_model:
        # Run the multimodal encoder if any.
        self._execute_encoder(scheduler_output)
        encoder_outputs = self._gather_encoder_outputs(scheduler_output)
    else:
        encoder_outputs = []

    # Prepare the decoder inputs.
    attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
    num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
    if (self.use_cuda_graph
            and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
        # Use piecewise CUDA graphs.
        # Add padding to the batch size.
        num_input_tokens = self.vllm_config.pad_for_cudagraph(
            num_scheduled_tokens)
    else:
        # Eager mode.
        num_input_tokens = num_scheduled_tokens
    attn_metadata.num_input_tokens = num_input_tokens

    if self.is_multimodal_model:
        # NOTE(woosuk): To unify token ids and soft tokens (vision
        # embeddings), we always use embeddings (rather than token ids)
        # as input to the multimodal model, even when the input is text.
        input_ids = self.input_ids[:num_scheduled_tokens]
        if encoder_outputs:
            inputs_embeds = self.model.get_input_embeddings(
                input_ids, encoder_outputs)
        else:
            inputs_embeds = self.model.get_input_embeddings(input_ids)
        # TODO(woosuk): Avoid the copy. Optimize.
        # Only the first num_scheduled_tokens rows are written; the model
        # is then fed the (possibly padded) num_input_tokens slice of the
        # persistent buffer.
        self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
        inputs_embeds = self.inputs_embeds[:num_input_tokens]
        input_ids = None
    else:
        # For text-only models, we use token ids as input.
        # While it is possible to use embeddings as input just like the
        # multimodal models, it is not desirable for performance since
        # then the embedding layer is not included in the CUDA graph.
        input_ids = self.input_ids[:num_input_tokens]
        inputs_embeds = None

    # Run the decoder.
    # Use persistent buffers for CUDA graphs.
    with set_forward_context(attn_metadata, self.vllm_config):
        positions = self.mrope_positions[:, :num_input_tokens] \
            if self.model_config.uses_mrope \
            else self.positions[:num_input_tokens]
        # NOTE(review): `attn_metadata=None` is passed explicitly even
        # though metadata was built above; presumably the attention layers
        # read it from the forward context set by the enclosing `with` --
        # confirm against `set_forward_context`.
        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            kv_caches=self.kv_caches,
            attn_metadata=None,
            inputs_embeds=inputs_embeds,
        )
    # Drop any CUDA-graph padding rows, then keep only the rows selected by
    # logits_indices (the last scheduled token of each request).
    hidden_states = hidden_states[:num_scheduled_tokens]
    hidden_states = hidden_states[logits_indices]
    logits = self.model.compute_logits(hidden_states, None)

    # Sample the next token and get logprobs if needed.
    sampling_metadata = self._prepare_sampling(batch_changed)
    sampler_output = self.model.sample(
        logits=logits,
        sampling_metadata=sampling_metadata,
    )

    # TODO(woosuk): The following loop can be slow since it iterates over
    # the requests one by one. Optimize.
    num_reqs = self.input_batch.num_reqs
    # (batch index, request state, sequence length) of every request whose
    # sampled token is kept; filled in after the GPU -> CPU sync below.
    request_seq_lens: List[Tuple[int, CachedRequestState, int]] = []
    for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]):
        assert req_id is not None
        req_state = self.requests[req_id]
        seq_len = (req_state.num_computed_tokens +
                   scheduler_output.num_scheduled_tokens[req_id])
        assert seq_len <= req_state.num_tokens
        if seq_len == req_state.num_tokens:
            # Append the sampled token to the output token ids.
            self.input_batch.num_tokens[i] += 1
            # OPTIMIZATION: Priming the state updates for later updates.
            # A placeholder 0 is appended now and overwritten with the
            # real token id once it is available on the CPU.
            req_state.output_token_ids.append(0)
            request_seq_lens.append((i, req_state, seq_len))
        else:
            # Ignore the sampled token from the partial request.
            # Rewind the generator state as if the token was not sampled.
            generator = self.input_batch.generators.get(i)
            if generator is not None:
                # This relies on cuda-specific torch-internal impl details
                # NOTE(review): the constant 4 is presumably how far one
                # sampling call advances the generator offset -- TODO
                # confirm.
                generator.set_offset(generator.get_offset() - 4)

    # num_reqs entries should be non-None
    assert all(
        req_id is not None for req_id in
        self.input_batch.req_ids[:num_reqs]), "req_ids contains None"
    req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs])

    # NOTE: GPU -> CPU Sync happens here.
    # Move as many CPU operations as possible before this sync point.
    sampled_token_ids = sampler_output.sampled_token_ids.tolist()
    # Update with the actual token ids
    for i, req_state, seq_len in request_seq_lens:
        token_id = sampled_token_ids[i]
        self.input_batch.token_ids_cpu[i, seq_len] = token_id
        req_state.output_token_ids[-1] = token_id

    if sampler_output.logprob_token_ids is None:
        logprob_token_ids = None
    else:
        logprob_token_ids = sampler_output.logprob_token_ids.cpu()
    if sampler_output.logprobs is None:
        logprobs = None
    else:
        logprobs = sampler_output.logprobs.cpu()

    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=self.input_batch.req_id_to_index,
        sampled_token_ids=sampled_token_ids,
        logprob_token_ids_cpu=logprob_token_ids,
        logprobs_cpu=logprobs,
    )
    return model_runner_output
def profile_run(self) -> None:
    """Execute a dummy forward pass at the maximum token count.

    When the model is multimodal and both the encoder input budget and the
    encoder cache size are positive, the multimodal encoder is first run on
    a batch of dummy items and its outputs are parked in
    ``self.encoder_cache["tmp"]`` for the duration of the run. Afterwards
    the decoder is run once through ``_dummy_run`` with
    ``self.max_num_tokens`` tokens and empty placeholder KV caches, logits
    are computed, the device is synchronized, and all temporaries
    (including the encoder cache) are released.
    """
    # use an empty tensor instead of `None` to force Dynamo to pass
    # it by reference, rather than by specializing on the value `None`.
    # the `dtype` argument does not matter, and we use `float32` as
    # a placeholder (it has wide hardware support).
    # it is important to create tensors inside the loop, rather than
    # multiplying the list, to avoid Dynamo from treating them as
    # tensor aliasing.
    dummy_kv_caches = [
        torch.tensor([], dtype=torch.float32, device=self.device)
        for _ in range(self.num_attn_layers)
    ]

    # Profile with multimodal encoder & encoder cache.
    # TODO: handle encoder-decoder models once we support them.
    if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
            and self.encoder_cache_size > 0):

        # NOTE: Currently model is profiled with a single non-text
        # modality with the max possible input tokens even when
        # it supports multiple.
        # NOTE(review): this lookup goes through the module-level
        # MULTIMODAL_REGISTRY while the limits below are read from
        # self.mm_registry -- confirm both refer to the same registry.
        max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(  # noqa: E501
            self.model_config)
        # Pick the modality whose single item needs the most tokens.
        dummy_data_modality, max_tokens_per_mm_item = max(
            max_tokens_by_modality_dict.items(), key=lambda item: item[1])

        # Check how many items of this modality can be supported by
        # the encoder budget.
        encoder_budget = min(self.max_num_encoder_input_tokens,
                             self.encoder_cache_size)

        # `cdiv` is presumably a ceiling division -- TODO confirm.
        max_num_mm_items_encoder_budget = cdiv(encoder_budget,
                                               max_tokens_per_mm_item)

        # Check how many items of this modality can be supported by
        # the decoder budget.
        max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
            self.model_config)[dummy_data_modality]

        # NOTE: We do not consider max_num_batched_tokens on purpose
        # because the multimodal embeddings can be generated in advance
        # and chunked prefilled.
        max_num_mm_items_decoder_budget = self.max_num_reqs * \
            max_mm_items_per_req

        max_num_mm_items = min(max_num_mm_items_encoder_budget,
                               max_num_mm_items_decoder_budget)

        logger.info(
            "Encoder cache will be initialized with a budget of %s tokens,"
            " and profiled with %s %s items of the maximum feature size.",
            encoder_budget, max_num_mm_items, dummy_data_modality)

        # Create dummy batch of multimodal inputs.
        dummy_request_data = self.input_registry.dummy_data_for_profiling(
            model_config=self.model_config,
            seq_len=self.max_num_tokens,
            mm_registry=self.mm_registry,
        )
        dummy_mm_data = dummy_request_data.multi_modal_data

        # Dummy data definition in V0 may contain multiple multimodal items
        # (e.g, multiple images) for a single request, therefore here we
        # always replicate first item by max_num_mm_items times since in V1
        # they are scheduled to be processed separately.

        # Case when models have a merged processor, their dummy data is
        # already batched `MultiModalKwargs`, therefore we take the first
        # `MultiModalKwargsItem` from the desired modality to profile on.
        if isinstance(dummy_mm_data, MultiModalKwargs):
            dummy_mm_item = dummy_mm_data.get_item(
                modality=dummy_data_modality, item_index=0)
            dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])

        # Case when models have dummy data explicitly defined as
        # `MultiModalDataDict`, so they need to be processed through input
        # mapper.
        # TODO (ywang96): deprecate this path once merged processor is
        # supported on all models.
        else:
            mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
                mm_data=dummy_mm_data,
                mm_hashes=None,
                mm_processor_kwargs=None,
                precomputed_mm_inputs=None)
            dummy_mm_kwargs = mm_kwargs_list[0]

        # Replicate the single dummy item max_num_mm_items times.
        batched_dummy_mm_inputs = MultiModalKwargs.batch(
            [dummy_mm_kwargs] * max_num_mm_items)
        batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
            batched_dummy_mm_inputs, device=self.device)

        # Run multimodal encoder.
        dummy_encoder_outputs = self.model.get_multimodal_embeddings(
            **batched_dummy_mm_inputs)
        assert len(dummy_encoder_outputs) == max_num_mm_items, (
            "Expected dimension 0 of encoder outputs to match the number "
            f"of multimodal data items: {max_num_mm_items}, got "
            f"{len(dummy_encoder_outputs)=} instead. This is most likely "
            "due to the 'get_multimodal_embeddings' method of the model "
            "not implemented correctly.")

        # Cache the dummy encoder outputs.
        # They stay referenced under the "tmp" key until the cache is
        # cleared at the end of this method.
        self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))

    # Trigger compilation for general shape.
    hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches)
    logits = self.model.compute_logits(hidden_states, None)
    logits = logits[:self.max_num_tokens]
    # TODO(woosuk): Consider the memory usage of the sampler.
    torch.cuda.synchronize()
    # Release everything allocated for the dummy run.
    del hidden_states, logits
    self.encoder_cache.clear()
    gc.collect()
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
    """
    Allocate the per-layer KV cache tensors described by `kv_cache_config`
    and bind them to the model's attention layers.

    Args:
        kv_cache_config: Configuration for the KV cache, including the KV
        cache size of each layer

    Raises:
        NotImplementedError: If the config has more than one KV cache group
        or a layer spec is not a `FullAttentionSpec`.
    """
    if len(kv_cache_config.groups) > 1:
        raise NotImplementedError(
            "Hybrid models with more than one KV cache type are not "
            "supported yet.")

    layer_caches: Dict[str, torch.Tensor] = {}

    for layer_name, spec in kv_cache_config.kv_cache_spec.items():
        layer_tensor = kv_cache_config.tensors[layer_name]
        # The byte budget of a layer must be a whole number of pages.
        num_blocks, leftover = divmod(layer_tensor.size,
                                      spec.page_size_bytes)
        assert leftover == 0

        # Only full attention is supported for now.
        if not isinstance(spec, FullAttentionSpec):
            raise NotImplementedError

        cache_shape = FlashAttentionBackend.get_kv_cache_shape(
            num_blocks, spec.block_size, spec.num_kv_heads, spec.head_size)
        layer_caches[layer_name] = torch.zeros(cache_shape,
                                               dtype=spec.dtype,
                                               device=self.device)

    bind_kv_cache(
        layer_caches,
        self.vllm_config.compilation_config.static_forward_context,
        self.kv_caches)
Layers that do not need KV cache are not included. + """ + + forward_ctx = self.vllm_config.compilation_config.static_forward_context + block_size = self.vllm_config.cache_config.block_size + kv_cache_spec: KVCacheSpec = {} + for layer_name, attn_module in forward_ctx.items(): + # TODO: Support other attention modules, e.g., sliding window, + # cross-attention, MLA. + assert isinstance(attn_module, Attention) + if attn_module.attn_type == AttentionType.DECODER: + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=attn_module.dtype, + ) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. + continue + elif attn_module.attn_type == AttentionType.ENCODER_DECODER: + raise NotImplementedError + else: + raise ValueError( + f"Unknown attention type: {attn_module.attn_type}") + + return kv_cache_spec diff --git a/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py b/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..0adb69073397c9fde95d7d6c88fc6314e9af7b28 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +"""A GPU worker class.""" +import gc +import os +from typing import TYPE_CHECKING, Optional + +import torch +import torch.distributed +import torch.nn as nn + +import vllm.envs as envs +from vllm.config import ParallelConfig, VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.platforms import current_platform +from vllm.utils import GiB_bytes +from vllm.v1.core.scheduler 
from vllm.v1.core.scheduler import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

logger = init_logger(__name__)

if TYPE_CHECKING:
    # SchedulerOutput is also imported at runtime above; this guarded import
    # is retained for static type checkers.
    from vllm.v1.core.scheduler import SchedulerOutput


class Worker:
    """GPU worker that owns a ``GPUModelRunner``.

    The worker sets up the CUDA device and the distributed environment,
    then exposes model loading, memory profiling, KV-cache allocation,
    warm-up / CUDA-graph capture, and model execution.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
        is_driver_worker: bool = False,
    ):
        """Store the configuration and optionally create a torch profiler.

        Args:
            vllm_config: Aggregate config; each sub-config is unpacked onto
                an attribute of the same name on this worker.
            local_rank: Index used to select the ``cuda:{local_rank}`` device
                in ``init_device``.
            rank: Rank of this worker; also written to
                ``parallel_config.rank``.
            distributed_init_method: Forwarded to
                ``init_worker_distributed_environment`` in ``init_device``.
            is_driver_worker: Accepted for interface compatibility; it is
                not read anywhere in this class.
        """

        # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.load_config = vllm_config.load_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.device_config = vllm_config.device_config
        self.speculative_config = vllm_config.speculative_config
        self.prompt_adapter_config = vllm_config.prompt_adapter_config
        self.observability_config = vllm_config.observability_config

        self.parallel_config.rank = rank
        self.local_rank = local_rank
        self.rank = rank
        self.distributed_init_method = distributed_init_method

        if self.model_config.trust_remote_code:
            # note: lazy import to avoid importing torch before initializing
            from vllm.utils import init_cached_hf_modules
            init_cached_hf_modules()

        # Torch profiler. Enabled and configured through env vars:
        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
        if envs.VLLM_TORCH_PROFILER_DIR:
            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
            logger.info("Profiling enabled. Traces will be saved to: %s",
                        torch_profiler_trace_dir)
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                with_stack=True,
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    torch_profiler_trace_dir, use_gzip=True))
        else:
            self.profiler = None

    def sleep(self, level: int = 1) -> None:
        """Put the ``CuMemAllocator`` to sleep and log how much was freed.

        Args:
            level: When ``1``, allocations tagged ``"weights"`` are passed as
                ``offload_tags``; for any other value no tags are offloaded.

        Raises:
            AssertionError: If free GPU memory decreased across the call.
        """
        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
        allocator = CuMemAllocator.get_instance()
        allocator.sleep(offload_tags=("weights", ) if level == 1 else ())
        free_bytes_after_sleep, total = torch.cuda.mem_get_info()
        freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
        used_bytes = total - free_bytes_after_sleep
        assert freed_bytes >= 0, "Memory usage increased after sleeping."
        logger.info(
            "Sleep mode freed %.2f GiB memory, "
            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
            used_bytes / GiB_bytes)

    def wake_up(self) -> None:
        """Call ``wake_up()`` on the ``CuMemAllocator`` instance."""
        allocator = CuMemAllocator.get_instance()
        allocator.wake_up()

    def init_device(self):
        """Select the CUDA device, init distributed state, build the runner.

        Records the free GPU memory at start-up in ``self.init_gpu_memory``
        (used later by ``determine_available_memory``), seeds the RNG, and
        creates ``self.model_runner``.

        Raises:
            RuntimeError: If the configured device type is not ``"cuda"``.
        """
        if self.device_config.device.type == "cuda":
            # torch.distributed.all_reduce does not free the input tensor until
            # the synchronization point. This causes the memory usage to grow
            # as the number of all_reduce calls increases. This env var disables
            # this behavior.
            # Related issue:
            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

            # This env var set by Ray causes exceptions with graph building.
            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
            self.device = torch.device(f"cuda:{self.local_rank}")
            torch.cuda.set_device(self.device)

            _check_if_gpu_supports_dtype(self.model_config.dtype)
            gc.collect()
            torch.cuda.empty_cache()
            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
        else:
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")
        # Initialize the distributed environment.
        init_worker_distributed_environment(self.parallel_config, self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)

        # Construct the model runner
        self.model_runner = GPUModelRunner(self.vllm_config, self.device)

    def load_model(self) -> None:
        """Load the model through the model runner.

        With sleep mode enabled the load runs inside the allocator's memory
        pool tagged ``"weights"``; otherwise it runs in a no-op context.

        Raises:
            AssertionError: If sleep mode is enabled and the allocator
                already reports non-zero usage.
        """
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CuMemAllocator.get_instance()
            assert allocator.get_current_usage() == 0, (
                "Sleep mode can only be "
                "used for one instance per process.")
            context = allocator.use_memory_pool(tag="weights")
        else:
            from contextlib import nullcontext
            context = nullcontext()
        with context:
            self.model_runner.load_model()

    @torch.inference_mode()
    def determine_available_memory(self) -> int:
        """Profiles the peak memory usage of the model to determine how much
        memory can be used for KV cache without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
        Then, it calculates the free memory that can be used for KV cache in
        bytes.

        .. tip::
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.

        Returns:
            ``total_gpu_memory * gpu_memory_utilization - peak_memory``,
            truncated to ``int``. The value is not clamped, so it can be
            negative when the profiled peak exceeds the budget.

        Raises:
            AssertionError: If free memory after profiling is not lower than
                the free memory recorded in ``init_device``.
        """
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        _, total_gpu_memory = torch.cuda.mem_get_info()
        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
        self.model_runner.profile_run()

        free_gpu_memory, _ = torch.cuda.mem_get_info()
        # NOTE(woosuk): Here we assume that the other processes using the same
        # GPU did not change their memory usage during the profiling.
        assert self.init_gpu_memory > free_gpu_memory, (
            "Error in memory profiling. "
            f"Initial free memory {self.init_gpu_memory}, current free memory"
            f" {free_gpu_memory}. This happens when the GPU memory was "
            "not properly cleaned up before initializing the vLLM instance.")

        # Get the peak memory allocation recorded by torch
        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

        # Check for any memory left around that may have been allocated on the
        # gpu outside of `torch`. NCCL operations, for example, can use a few
        # GB during a forward pass
        torch.cuda.empty_cache()
        torch_allocated_bytes = torch.cuda.memory_stats(
        )["allocated_bytes.all.current"]
        # Read free/total from a single query so both numbers describe the
        # same snapshot of device memory.
        free_after_cleanup, total_after_cleanup = torch.cuda.mem_get_info()
        total_allocated_bytes = total_after_cleanup - free_after_cleanup
        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
        if non_torch_allocations > 0:
            peak_memory += non_torch_allocations
        available_kv_cache_memory = (
            total_gpu_memory * self.cache_config.gpu_memory_utilization -
            peak_memory)

        return int(available_kv_cache_memory)

    def get_kv_cache_spec(self) -> KVCacheSpec:
        """Return the KV cache spec reported by the model runner."""
        return self.model_runner.get_kv_cache_spec()

    def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config.

        With sleep mode enabled the allocation runs inside the allocator's
        memory pool tagged ``"kv_cache"``.
        """
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CuMemAllocator.get_instance()
            context = allocator.use_memory_pool(tag="kv_cache")
        else:
            from contextlib import nullcontext
            context = nullcontext()
        with context:
            self.model_runner.initialize_kv_cache(kv_cache_config)

    def compile_or_warm_up_model(self) -> None:
        """Warm up the configured compile sizes and capture the model.

        Sizes are run largest-first. Unless ``enforce_eager`` is set, sizes
        already listed in ``cudagraph_capture_sizes`` are skipped here and
        ``capture_model()`` is called afterwards. The random seed is reset
        at the end.
        """
        # warm up sizes that are not in cudagraph capture sizes,
        # but users still want to compile for better performance,
        # e.g. for the max-num-batched token size in chunked prefill.
        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
        if not self.model_config.enforce_eager:
            warmup_sizes = [
                x for x in warmup_sizes if x not in
                self.vllm_config.compilation_config.cudagraph_capture_sizes
            ]
        for size in sorted(warmup_sizes, reverse=True):
            logger.info("Compile and warming up model for size %d", size)
            self.model_runner._dummy_run(size)
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model()
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)

    def get_model(self) -> nn.Module:
        """Return the model held by the model runner."""
        return self.model_runner.get_model()

    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
    ) -> Optional[ModelRunnerOutput]:
        """Run the model runner on ``scheduler_output``.

        Returns:
            The runner's output on rank 0; ``None`` on every other rank.
        """
        output = self.model_runner.execute_model(scheduler_output)
        return output if self.rank == 0 else None

    def profile(self, is_start: bool = True):
        """Start (``is_start=True``) or stop the torch profiler.

        Raises:
            RuntimeError: If no profiler was created in ``__init__``.
        """
        if self.profiler is None:
            raise RuntimeError("Profiler is not enabled.")
        if is_start:
            self.profiler.start()
        else:
            self.profiler.stop()

    def check_health(self) -> None:
        """No-op health check."""
        # worker will always be healthy as long as it's running.
        return


def init_worker_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
) -> None:
    """Initialize the distributed environment.

    Configures custom all-reduce from
    ``parallel_config.disable_custom_all_reduce``, initializes the
    distributed environment for ``parallel_config.world_size`` ranks, and
    ensures model parallelism is initialized with the configured tensor and
    pipeline parallel sizes.
    """
    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)

    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank)

    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                      parallel_config.pipeline_parallel_size)


def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
    """Raise if ``torch_dtype`` is bfloat16 and the GPU cannot support it.

    Only ``torch.bfloat16`` is checked; every other dtype is accepted.

    Raises:
        ValueError: If bfloat16 is requested and the current platform does
            not report device capability 80 or higher.
    """
    if torch_dtype != torch.bfloat16:
        return
    if current_platform.has_device_capability(80):
        return

    capability = current_platform.get_device_capability()
    gpu_name = current_platform.get_device_name()

    if capability is None:
        compute_str = "does not have a compute capability"
    else:
        version_str = capability.as_version_str()
        compute_str = f"has compute capability {version_str}"

    # The trailing space after "the" matters: adjacent string literals are
    # concatenated verbatim.
    raise ValueError(
        "Bfloat16 is only supported on GPUs with compute capability "
        f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
        "You can use float16 instead by explicitly setting the "
        "`dtype` flag in CLI, for example: --dtype=half.")