Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__init__.py +19 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py +605 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/arctic.py +206 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/chatglm.py +71 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/cohere2.py +194 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/eagle.py +51 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/exaone.py +191 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/falcon.py +89 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/h2ovl.py +15 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/internvl.py +53 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/medusa.py +62 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mllama.py +30 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mpt.py +179 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nemotron.py +204 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nvlm_d.py +14 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/solar.py +246 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/ultravox.py +101 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer.py +167 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer_utils.py +169 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/processor.py +106 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__init__.py +6 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/s3_utils.py +154 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer.py +245 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__init__.py +56 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +70 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +250 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +108 -0
- .venv/lib/python3.11/site-packages/vllm/transformers_utils/utils.py +22 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__init__.py +111 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/async_llm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from vllm.envs import VLLM_USE_MODELSCOPE
|
| 4 |
+
|
| 5 |
+
if VLLM_USE_MODELSCOPE:
|
| 6 |
+
# Patch here, before each import happens
|
| 7 |
+
import modelscope
|
| 8 |
+
from packaging import version
|
| 9 |
+
|
| 10 |
+
# patch_hub begins from modelscope>=1.18.1
|
| 11 |
+
if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
|
| 12 |
+
raise ImportError(
|
| 13 |
+
'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
|
| 14 |
+
'install by `pip install modelscope -U`')
|
| 15 |
+
|
| 16 |
+
from modelscope.utils.hf_util import patch_hub
|
| 17 |
+
|
| 18 |
+
# Patch hub to download models from modelscope to speed up.
|
| 19 |
+
patch_hub()
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (793 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (25 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc
ADDED
|
Binary file (6.19 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc
ADDED
|
Binary file (5.98 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc
ADDED
|
Binary file (3.33 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc
ADDED
|
Binary file (8.73 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc
ADDED
|
Binary file (11 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (1.37 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py
ADDED
|
@@ -0,0 +1,605 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import enum
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, Optional, Type, Union
|
| 8 |
+
|
| 9 |
+
import huggingface_hub
|
| 10 |
+
from huggingface_hub import (file_exists, hf_hub_download,
|
| 11 |
+
try_to_load_from_cache)
|
| 12 |
+
from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
|
| 13 |
+
LocalEntryNotFoundError,
|
| 14 |
+
RepositoryNotFoundError,
|
| 15 |
+
RevisionNotFoundError)
|
| 16 |
+
from torch import nn
|
| 17 |
+
from transformers import GenerationConfig, PretrainedConfig
|
| 18 |
+
from transformers.models.auto.image_processing_auto import (
|
| 19 |
+
get_image_processor_config)
|
| 20 |
+
from transformers.models.auto.modeling_auto import (
|
| 21 |
+
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
|
| 22 |
+
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
|
| 23 |
+
|
| 24 |
+
from vllm.envs import VLLM_USE_MODELSCOPE
|
| 25 |
+
from vllm.logger import init_logger
|
| 26 |
+
# yapf conflicts with isort for this block
|
| 27 |
+
# yapf: disable
|
| 28 |
+
from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
|
| 29 |
+
DbrxConfig, DeepseekVLV2Config,
|
| 30 |
+
EAGLEConfig, ExaoneConfig,
|
| 31 |
+
H2OVLChatConfig,
|
| 32 |
+
InternVLChatConfig, JAISConfig,
|
| 33 |
+
MedusaConfig, MllamaConfig,
|
| 34 |
+
MLPSpeculatorConfig, MPTConfig,
|
| 35 |
+
NemotronConfig, NVLM_D_Config,
|
| 36 |
+
Olmo2Config, RWConfig,
|
| 37 |
+
SolarConfig, Telechat2Config,
|
| 38 |
+
UltravoxConfig)
|
| 39 |
+
# yapf: enable
|
| 40 |
+
from vllm.transformers_utils.utils import check_gguf_file
|
| 41 |
+
from vllm.utils import resolve_obj_by_qualname
|
| 42 |
+
|
| 43 |
+
if VLLM_USE_MODELSCOPE:
|
| 44 |
+
from modelscope import AutoConfig
|
| 45 |
+
else:
|
| 46 |
+
from transformers import AutoConfig
|
| 47 |
+
|
| 48 |
+
MISTRAL_CONFIG_NAME = "params.json"
|
| 49 |
+
HF_TOKEN = os.getenv('HF_TOKEN', None)
|
| 50 |
+
|
| 51 |
+
logger = init_logger(__name__)
|
| 52 |
+
|
| 53 |
+
_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
|
| 54 |
+
"mllama": MllamaConfig
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
|
| 58 |
+
"chatglm": ChatGLMConfig,
|
| 59 |
+
"cohere2": Cohere2Config,
|
| 60 |
+
"dbrx": DbrxConfig,
|
| 61 |
+
"deepseek_vl_v2": DeepseekVLV2Config,
|
| 62 |
+
"mpt": MPTConfig,
|
| 63 |
+
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
|
| 64 |
+
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
|
| 65 |
+
"jais": JAISConfig,
|
| 66 |
+
"mlp_speculator": MLPSpeculatorConfig,
|
| 67 |
+
"medusa": MedusaConfig,
|
| 68 |
+
"eagle": EAGLEConfig,
|
| 69 |
+
"exaone": ExaoneConfig,
|
| 70 |
+
"h2ovl_chat": H2OVLChatConfig,
|
| 71 |
+
"internvl_chat": InternVLChatConfig,
|
| 72 |
+
"nemotron": NemotronConfig,
|
| 73 |
+
"NVLM_D": NVLM_D_Config,
|
| 74 |
+
"olmo2": Olmo2Config,
|
| 75 |
+
"solar": SolarConfig,
|
| 76 |
+
"telechat": Telechat2Config,
|
| 77 |
+
"ultravox": UltravoxConfig,
|
| 78 |
+
**_CONFIG_REGISTRY_OVERRIDE_HF
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class ConfigFormat(str, enum.Enum):
|
| 83 |
+
AUTO = "auto"
|
| 84 |
+
HF = "hf"
|
| 85 |
+
MISTRAL = "mistral"
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def file_or_path_exists(model: Union[str, Path], config_name: str,
|
| 89 |
+
revision: Optional[str]) -> bool:
|
| 90 |
+
if Path(model).exists():
|
| 91 |
+
return (Path(model) / config_name).is_file()
|
| 92 |
+
|
| 93 |
+
# Offline mode support: Check if config file is cached already
|
| 94 |
+
cached_filepath = try_to_load_from_cache(repo_id=model,
|
| 95 |
+
filename=config_name,
|
| 96 |
+
revision=revision)
|
| 97 |
+
if isinstance(cached_filepath, str):
|
| 98 |
+
# The config file exists in cache- we can continue trying to load
|
| 99 |
+
return True
|
| 100 |
+
|
| 101 |
+
# NB: file_exists will only check for the existence of the config file on
|
| 102 |
+
# hf_hub. This will fail in offline mode.
|
| 103 |
+
try:
|
| 104 |
+
return file_exists(model,
|
| 105 |
+
config_name,
|
| 106 |
+
revision=revision,
|
| 107 |
+
token=HF_TOKEN)
|
| 108 |
+
except huggingface_hub.errors.OfflineModeIsEnabled:
|
| 109 |
+
# Don't raise in offline mode, all we know is that we don't have this
|
| 110 |
+
# file cached.
|
| 111 |
+
return False
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def patch_rope_scaling(config: PretrainedConfig) -> None:
|
| 115 |
+
"""Provide backwards compatibility for RoPE."""
|
| 116 |
+
text_config = getattr(config, "text_config", None)
|
| 117 |
+
if text_config is not None:
|
| 118 |
+
patch_rope_scaling(text_config)
|
| 119 |
+
|
| 120 |
+
rope_scaling = getattr(config, "rope_scaling", None)
|
| 121 |
+
if rope_scaling is not None:
|
| 122 |
+
patch_rope_scaling_dict(rope_scaling)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None:
|
| 126 |
+
if "rope_type" in rope_scaling and "type" in rope_scaling:
|
| 127 |
+
rope_type = rope_scaling["rope_type"]
|
| 128 |
+
rope_type_legacy = rope_scaling["type"]
|
| 129 |
+
if rope_type != rope_type_legacy:
|
| 130 |
+
raise ValueError(
|
| 131 |
+
f"Found conflicts between 'rope_type={rope_type}' (modern "
|
| 132 |
+
f"field) and 'type={rope_type_legacy}' (legacy field). "
|
| 133 |
+
"You should only specify one of them.")
|
| 134 |
+
|
| 135 |
+
if "rope_type" not in rope_scaling and "type" in rope_scaling:
|
| 136 |
+
rope_scaling["rope_type"] = rope_scaling["type"]
|
| 137 |
+
logger.info("Replacing legacy 'type' key with 'rope_type'")
|
| 138 |
+
|
| 139 |
+
if "rope_type" not in rope_scaling:
|
| 140 |
+
raise ValueError("rope_scaling should have a 'rope_type' key")
|
| 141 |
+
|
| 142 |
+
if rope_scaling["rope_type"] == "su":
|
| 143 |
+
rope_scaling["rope_type"] = "longrope"
|
| 144 |
+
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
|
| 145 |
+
elif rope_scaling["rope_type"] == "mrope":
|
| 146 |
+
assert "mrope_section" in rope_scaling
|
| 147 |
+
rope_scaling["rope_type"] = "default"
|
| 148 |
+
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def uses_mrope(config: PretrainedConfig) -> bool:
|
| 152 |
+
"""Detect if the model with this config uses M-ROPE."""
|
| 153 |
+
rope_scaling = getattr(config, "rope_scaling", None)
|
| 154 |
+
if rope_scaling is None:
|
| 155 |
+
return False
|
| 156 |
+
|
| 157 |
+
return "mrope_section" in rope_scaling
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def is_encoder_decoder(config: PretrainedConfig) -> bool:
|
| 161 |
+
"""Detect if the model with this config is used as an encoder/decoder."""
|
| 162 |
+
text_config = getattr(config, "text_config", None)
|
| 163 |
+
if text_config is not None:
|
| 164 |
+
return is_encoder_decoder(text_config)
|
| 165 |
+
|
| 166 |
+
return getattr(config, "is_encoder_decoder", False)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def get_config(
|
| 170 |
+
model: Union[str, Path],
|
| 171 |
+
trust_remote_code: bool,
|
| 172 |
+
revision: Optional[str] = None,
|
| 173 |
+
code_revision: Optional[str] = None,
|
| 174 |
+
config_format: ConfigFormat = ConfigFormat.AUTO,
|
| 175 |
+
**kwargs,
|
| 176 |
+
) -> PretrainedConfig:
|
| 177 |
+
# Separate model folder from file path for GGUF models
|
| 178 |
+
|
| 179 |
+
is_gguf = check_gguf_file(model)
|
| 180 |
+
if is_gguf:
|
| 181 |
+
kwargs["gguf_file"] = Path(model).name
|
| 182 |
+
model = Path(model).parent
|
| 183 |
+
|
| 184 |
+
if config_format == ConfigFormat.AUTO:
|
| 185 |
+
if is_gguf or file_or_path_exists(
|
| 186 |
+
model, HF_CONFIG_NAME, revision=revision):
|
| 187 |
+
config_format = ConfigFormat.HF
|
| 188 |
+
elif file_or_path_exists(model, MISTRAL_CONFIG_NAME,
|
| 189 |
+
revision=revision):
|
| 190 |
+
config_format = ConfigFormat.MISTRAL
|
| 191 |
+
else:
|
| 192 |
+
# If we're in offline mode and found no valid config format, then
|
| 193 |
+
# raise an offline mode error to indicate to the user that they
|
| 194 |
+
# don't have files cached and may need to go online.
|
| 195 |
+
# This is conveniently triggered by calling file_exists().
|
| 196 |
+
file_exists(model,
|
| 197 |
+
HF_CONFIG_NAME,
|
| 198 |
+
revision=revision,
|
| 199 |
+
token=HF_TOKEN)
|
| 200 |
+
|
| 201 |
+
raise ValueError(f"No supported config format found in {model}")
|
| 202 |
+
|
| 203 |
+
if config_format == ConfigFormat.HF:
|
| 204 |
+
config_dict, _ = PretrainedConfig.get_config_dict(
|
| 205 |
+
model,
|
| 206 |
+
revision=revision,
|
| 207 |
+
code_revision=code_revision,
|
| 208 |
+
token=HF_TOKEN,
|
| 209 |
+
**kwargs,
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# Use custom model class if it's in our registry
|
| 213 |
+
model_type = config_dict.get("model_type")
|
| 214 |
+
if model_type in _CONFIG_REGISTRY:
|
| 215 |
+
config_class = _CONFIG_REGISTRY[model_type]
|
| 216 |
+
config = config_class.from_pretrained(
|
| 217 |
+
model,
|
| 218 |
+
revision=revision,
|
| 219 |
+
code_revision=code_revision,
|
| 220 |
+
token=HF_TOKEN,
|
| 221 |
+
**kwargs,
|
| 222 |
+
)
|
| 223 |
+
else:
|
| 224 |
+
try:
|
| 225 |
+
config = AutoConfig.from_pretrained(
|
| 226 |
+
model,
|
| 227 |
+
trust_remote_code=trust_remote_code,
|
| 228 |
+
revision=revision,
|
| 229 |
+
code_revision=code_revision,
|
| 230 |
+
token=HF_TOKEN,
|
| 231 |
+
**kwargs,
|
| 232 |
+
)
|
| 233 |
+
except ValueError as e:
|
| 234 |
+
if (not trust_remote_code
|
| 235 |
+
and "requires you to execute the configuration file"
|
| 236 |
+
in str(e)):
|
| 237 |
+
err_msg = (
|
| 238 |
+
"Failed to load the model config. If the model "
|
| 239 |
+
"is a custom model not yet available in the "
|
| 240 |
+
"HuggingFace transformers library, consider setting "
|
| 241 |
+
"`trust_remote_code=True` in LLM or using the "
|
| 242 |
+
"`--trust-remote-code` flag in the CLI.")
|
| 243 |
+
raise RuntimeError(err_msg) from e
|
| 244 |
+
else:
|
| 245 |
+
raise e
|
| 246 |
+
|
| 247 |
+
elif config_format == ConfigFormat.MISTRAL:
|
| 248 |
+
config = load_params_config(model, revision, token=HF_TOKEN, **kwargs)
|
| 249 |
+
else:
|
| 250 |
+
raise ValueError(f"Unsupported config format: {config_format}")
|
| 251 |
+
|
| 252 |
+
# Special architecture mapping check for GGUF models
|
| 253 |
+
if is_gguf:
|
| 254 |
+
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
|
| 255 |
+
raise RuntimeError(
|
| 256 |
+
f"Can't get gguf config for {config.model_type}.")
|
| 257 |
+
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
|
| 258 |
+
config.update({"architectures": [model_type]})
|
| 259 |
+
|
| 260 |
+
patch_rope_scaling(config)
|
| 261 |
+
|
| 262 |
+
if trust_remote_code:
|
| 263 |
+
maybe_register_config_serialize_by_value()
|
| 264 |
+
|
| 265 |
+
return config
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def get_hf_file_to_dict(file_name: str,
|
| 269 |
+
model: Union[str, Path],
|
| 270 |
+
revision: Optional[str] = 'main'):
|
| 271 |
+
"""
|
| 272 |
+
Downloads a file from the Hugging Face Hub and returns
|
| 273 |
+
its contents as a dictionary.
|
| 274 |
+
|
| 275 |
+
Parameters:
|
| 276 |
+
- file_name (str): The name of the file to download.
|
| 277 |
+
- model (str): The name of the model on the Hugging Face Hub.
|
| 278 |
+
- revision (str): The specific version of the model.
|
| 279 |
+
|
| 280 |
+
Returns:
|
| 281 |
+
- config_dict (dict): A dictionary containing
|
| 282 |
+
the contents of the downloaded file.
|
| 283 |
+
"""
|
| 284 |
+
file_path = Path(model) / file_name
|
| 285 |
+
|
| 286 |
+
if file_or_path_exists(model=model,
|
| 287 |
+
config_name=file_name,
|
| 288 |
+
revision=revision):
|
| 289 |
+
|
| 290 |
+
if not file_path.is_file():
|
| 291 |
+
try:
|
| 292 |
+
hf_hub_file = hf_hub_download(model,
|
| 293 |
+
file_name,
|
| 294 |
+
revision=revision)
|
| 295 |
+
except (RepositoryNotFoundError, RevisionNotFoundError,
|
| 296 |
+
EntryNotFoundError, LocalEntryNotFoundError) as e:
|
| 297 |
+
logger.debug("File or repository not found in hf_hub_download",
|
| 298 |
+
e)
|
| 299 |
+
return None
|
| 300 |
+
except HfHubHTTPError as e:
|
| 301 |
+
logger.warning(
|
| 302 |
+
"Cannot connect to Hugging Face Hub. Skipping file "
|
| 303 |
+
"download for '%s':",
|
| 304 |
+
file_name,
|
| 305 |
+
exc_info=e)
|
| 306 |
+
return None
|
| 307 |
+
file_path = Path(hf_hub_file)
|
| 308 |
+
|
| 309 |
+
with open(file_path) as file:
|
| 310 |
+
return json.load(file)
|
| 311 |
+
return None
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def get_pooling_config(model: str, revision: Optional[str] = 'main'):
|
| 315 |
+
"""
|
| 316 |
+
This function gets the pooling and normalize
|
| 317 |
+
config from the model - only applies to
|
| 318 |
+
sentence-transformers models.
|
| 319 |
+
|
| 320 |
+
Args:
|
| 321 |
+
model (str): The name of the Hugging Face model.
|
| 322 |
+
revision (str, optional): The specific version
|
| 323 |
+
of the model to use. Defaults to 'main'.
|
| 324 |
+
|
| 325 |
+
Returns:
|
| 326 |
+
dict: A dictionary containing the pooling
|
| 327 |
+
type and whether normalization is used.
|
| 328 |
+
"""
|
| 329 |
+
|
| 330 |
+
modules_file_name = "modules.json"
|
| 331 |
+
modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
|
| 332 |
+
|
| 333 |
+
if modules_dict is None:
|
| 334 |
+
return None
|
| 335 |
+
|
| 336 |
+
pooling = next((item for item in modules_dict
|
| 337 |
+
if item["type"] == "sentence_transformers.models.Pooling"),
|
| 338 |
+
None)
|
| 339 |
+
normalize = bool(
|
| 340 |
+
next((item for item in modules_dict
|
| 341 |
+
if item["type"] == "sentence_transformers.models.Normalize"),
|
| 342 |
+
False))
|
| 343 |
+
|
| 344 |
+
if pooling:
|
| 345 |
+
|
| 346 |
+
pooling_file_name = "{}/config.json".format(pooling["path"])
|
| 347 |
+
pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
|
| 348 |
+
pooling_type_name = next(
|
| 349 |
+
(item for item, val in pooling_dict.items() if val is True), None)
|
| 350 |
+
|
| 351 |
+
if pooling_type_name is not None:
|
| 352 |
+
pooling_type_name = get_pooling_config_name(pooling_type_name)
|
| 353 |
+
|
| 354 |
+
return {"pooling_type": pooling_type_name, "normalize": normalize}
|
| 355 |
+
|
| 356 |
+
return None
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
|
| 360 |
+
if "pooling_mode_" in pooling_name:
|
| 361 |
+
pooling_name = pooling_name.replace("pooling_mode_", "")
|
| 362 |
+
|
| 363 |
+
if "_" in pooling_name:
|
| 364 |
+
pooling_name = pooling_name.split("_")[0]
|
| 365 |
+
|
| 366 |
+
if "lasttoken" in pooling_name:
|
| 367 |
+
pooling_name = "last"
|
| 368 |
+
|
| 369 |
+
supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
|
| 370 |
+
pooling_type_name = pooling_name.upper()
|
| 371 |
+
|
| 372 |
+
try:
|
| 373 |
+
if pooling_type_name in supported_pooling_types:
|
| 374 |
+
return pooling_type_name
|
| 375 |
+
except NotImplementedError as e:
|
| 376 |
+
logger.debug("Pooling type not supported", e)
|
| 377 |
+
return None
|
| 378 |
+
return None
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def get_sentence_transformer_tokenizer_config(model: str,
|
| 382 |
+
revision: Optional[str] = 'main'
|
| 383 |
+
):
|
| 384 |
+
"""
|
| 385 |
+
Returns the tokenization configuration dictionary for a
|
| 386 |
+
given Sentence Transformer BERT model.
|
| 387 |
+
|
| 388 |
+
Parameters:
|
| 389 |
+
- model (str): The name of the Sentence Transformer
|
| 390 |
+
BERT model.
|
| 391 |
+
- revision (str, optional): The revision of the m
|
| 392 |
+
odel to use. Defaults to 'main'.
|
| 393 |
+
|
| 394 |
+
Returns:
|
| 395 |
+
- dict: A dictionary containing the configuration parameters
|
| 396 |
+
for the Sentence Transformer BERT model.
|
| 397 |
+
"""
|
| 398 |
+
for config_name in [
|
| 399 |
+
"sentence_bert_config.json",
|
| 400 |
+
"sentence_roberta_config.json",
|
| 401 |
+
"sentence_distilbert_config.json",
|
| 402 |
+
"sentence_camembert_config.json",
|
| 403 |
+
"sentence_albert_config.json",
|
| 404 |
+
"sentence_xlm-roberta_config.json",
|
| 405 |
+
"sentence_xlnet_config.json",
|
| 406 |
+
]:
|
| 407 |
+
encoder_dict = get_hf_file_to_dict(config_name, model, revision)
|
| 408 |
+
if encoder_dict:
|
| 409 |
+
break
|
| 410 |
+
|
| 411 |
+
if not encoder_dict:
|
| 412 |
+
return None
|
| 413 |
+
|
| 414 |
+
if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")):
|
| 415 |
+
return encoder_dict
|
| 416 |
+
return None
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
def maybe_register_config_serialize_by_value() -> None:
|
| 420 |
+
"""Try to register HF model configuration class to serialize by value
|
| 421 |
+
|
| 422 |
+
If trust_remote_code is set, and the model's config file specifies an
|
| 423 |
+
`AutoConfig` class, then the config class is typically an instance of
|
| 424 |
+
a custom class imported from the HF modules cache.
|
| 425 |
+
|
| 426 |
+
Examples:
|
| 427 |
+
|
| 428 |
+
>>> from transformers import AutoConfig
|
| 429 |
+
>>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
|
| 430 |
+
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
|
| 431 |
+
>>> import transformers_modules # error, not initialized
|
| 432 |
+
>>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
|
| 433 |
+
>>> import transformers_modules # success, initialized
|
| 434 |
+
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
|
| 435 |
+
|
| 436 |
+
In the DeepSeek example, the config class is an instance of a custom
|
| 437 |
+
class that is not serializable by default. This class will not be
|
| 438 |
+
importable in spawned workers, and won't exist at all on
|
| 439 |
+
other nodes, which breaks serialization of the config.
|
| 440 |
+
|
| 441 |
+
In this function we tell the cloudpickle serialization library to pass
|
| 442 |
+
instances of these generated classes by value instead of by reference,
|
| 443 |
+
i.e. the class definition is serialized along with its data so that the
|
| 444 |
+
class module does not need to be importable on the receiving end.
|
| 445 |
+
|
| 446 |
+
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
|
| 447 |
+
""" # noqa
|
| 448 |
+
try:
|
| 449 |
+
import transformers_modules
|
| 450 |
+
except ImportError:
|
| 451 |
+
# the config does not need trust_remote_code
|
| 452 |
+
return
|
| 453 |
+
|
| 454 |
+
try:
|
| 455 |
+
import cloudpickle
|
| 456 |
+
cloudpickle.register_pickle_by_value(transformers_modules)
|
| 457 |
+
|
| 458 |
+
# ray vendors its own version of cloudpickle
|
| 459 |
+
from vllm.executor.ray_utils import ray
|
| 460 |
+
if ray:
|
| 461 |
+
ray.cloudpickle.register_pickle_by_value(transformers_modules)
|
| 462 |
+
|
| 463 |
+
# multiprocessing uses pickle to serialize arguments when using spawn
|
| 464 |
+
# Here we get pickle to use cloudpickle to serialize config objects
|
| 465 |
+
# that contain instances of the custom config class to avoid
|
| 466 |
+
# serialization problems if the generated module (and model) has a `.`
|
| 467 |
+
# in its name
|
| 468 |
+
import multiprocessing
|
| 469 |
+
import pickle
|
| 470 |
+
|
| 471 |
+
from vllm.config import VllmConfig
|
| 472 |
+
|
| 473 |
+
def _reduce_config(config: VllmConfig):
|
| 474 |
+
return (pickle.loads, (cloudpickle.dumps(config), ))
|
| 475 |
+
|
| 476 |
+
multiprocessing.reducer.register(VllmConfig, _reduce_config)
|
| 477 |
+
|
| 478 |
+
except Exception as e:
|
| 479 |
+
logger.warning(
|
| 480 |
+
"Unable to register remote classes used by"
|
| 481 |
+
" trust_remote_code with by-value serialization. This may"
|
| 482 |
+
" lead to a later error. If remote code is not needed"
|
| 483 |
+
" remove `--trust-remote-code`",
|
| 484 |
+
exc_info=e)
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def load_params_config(model: Union[str, Path], revision: Optional[str],
                       **kwargs) -> PretrainedConfig:
    # This function loads a params.json config which
    # should be used when loading models in mistral format

    config_file_name = "params.json"

    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
    assert isinstance(config_dict, dict)

    # Translation table from mistral-native key names to the
    # HF-transformers equivalents expected by downstream code.
    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        # Recursively wrap every (nested) dict into a PretrainedConfig,
        # renaming keys via config_mapping; non-dict values pass through
        # unchanged.
        if isinstance(elem, dict):
            config_dict = {}
            for key, value in elem.items():
                key = config_mapping.get(key, key)
                config_dict[key] = recurse_elems(value)
            return PretrainedConfig(**config_dict)
        else:
            return elem

    # Fill in HF-style fields that params.json does not carry directly.
    config_dict["model_type"] = config_dict.get("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
    config_dict["max_position_embeddings"] = config_dict.get(
        "max_position_embeddings", 128_000)

    # The presence of a "moe" section distinguishes Mixtral-style MoE
    # checkpoints from dense Mistral ones.
    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    # A "vision_encoder" section marks a Pixtral multi-modal checkpoint:
    # re-nest the flat config into text/vision sub-configs.
    if config_dict.get("vision_encoder") is not None:
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    # Caller-supplied overrides take precedence over everything above.
    config_dict.update(kwargs)

    config = recurse_elems(config_dict)
    return config
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
def get_hf_image_processor_config(
    model: Union[str, Path],
    revision: Optional[str] = None,
    **kwargs,
) -> Dict[str, Any]:
    """Fetch the image-processor config dict for *model*.

    Returns an empty dict when running against ModelScope, which does not
    expose an image-processor endpoint.
    """
    # ModelScope does not provide an interface for image_processor
    if VLLM_USE_MODELSCOPE:
        return dict()
    # Separate model folder from file path for GGUF models
    if check_gguf_file(model):
        model = Path(model).parent
    return get_image_processor_config(model, revision=revision, **kwargs)
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def get_hf_text_config(config: PretrainedConfig):
    """Return the text-only sub-config of a multi-modal model config.

    Configs without a ``text_config`` attribute (pure text models) are
    returned unchanged.
    """
    if not hasattr(config, "text_config"):
        return config
    text_config = config.text_config
    # Downstream code assumes the text config exposes
    # `num_attention_heads` (among others). Assert here to fail early
    # if the transformers config doesn't align with this assumption.
    assert hasattr(text_config, "num_attention_heads")
    return text_config
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
def try_get_generation_config(
    model: str,
    trust_remote_code: bool,
    revision: Optional[str] = None,
) -> Optional[GenerationConfig]:
    """Best-effort load of a model's generation config.

    First tries the dedicated generation config file; if it does not
    exist, falls back to deriving one from the model config. Returns
    ``None`` when neither can be found.
    """
    try:
        return GenerationConfig.from_pretrained(
            model,
            revision=revision,
        )
    except OSError:  # Not found
        try:
            config = get_config(
                model,
                trust_remote_code=trust_remote_code,
                revision=revision,
            )
            return GenerationConfig.from_model_config(config)
        except OSError:  # Not found
            return None
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def get_cross_encoder_activation_function(config: PretrainedConfig):
    """Resolve the activation applied to cross-encoder (reranker) logits.

    Honors a ``sbert_ce_default_activation_function`` qualname on the
    config (restricted to ``torch.nn.modules`` members); otherwise uses
    ``nn.Sigmoid`` for single-label models and ``nn.Identity`` for
    multi-label ones.

    Raises:
        ValueError: if the configured qualname is outside
            ``torch.nn.modules``.
    """
    if (hasattr(config, "sbert_ce_default_activation_function")
            and config.sbert_ce_default_activation_function is not None):

        function_name = config.sbert_ce_default_activation_function
        # Use an explicit check instead of `assert`: assertions are
        # stripped under `python -O`, which would silently disable this
        # security restriction on which objects may be loaded by name.
        if not function_name.startswith("torch.nn.modules."):
            raise ValueError(
                "Loading of activation functions is restricted to "
                "torch.nn.modules for security reasons")
        return resolve_obj_by_qualname(function_name)()
    else:
        return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/arctic.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# yapf: disable
|
| 4 |
+
# ruff: noqa: E501
|
| 5 |
+
# coding=utf-8
|
| 6 |
+
# Copied from
|
| 7 |
+
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
| 8 |
+
""" Arctic model configuration"""
|
| 9 |
+
|
| 10 |
+
from dataclasses import asdict, dataclass
|
| 11 |
+
from typing import Any, Dict
|
| 12 |
+
|
| 13 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 14 |
+
from transformers.utils import logging
|
| 15 |
+
|
| 16 |
+
logger = logging.get_logger(__name__)
|
| 17 |
+
|
| 18 |
+
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
| 19 |
+
"arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ArcticLoraConfig:
    # LoRA adapter settings for Arctic checkpoints.
    lora_r: int = 64  # rank of the low-rank update matrices
    lora_alpha: float = 16  # scaling factor applied to the LoRA update
    shard_base_weights: bool = False  # whether base weights are sharded
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class ArcticQuantizationConfig:
    # Quantization settings for Arctic checkpoints.
    q_bits: int = 8  # number of bits per quantized weight
    rounding: str = "nearest"  # rounding mode used during quantization
    mantissa_bits: int = 3  # mantissa bits kept by the quantized format
    group_size: int = 128  # number of weights sharing one scale
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..


    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            The number of experts to root per-token, can be also interpreted as the `top-p` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from the Arctic 7B style configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        # Normalize a raw dict (as found in config.json) into the typed
        # quantization config; None / already-typed values pass through.
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
        # PretrainedConfig.from_dict may return (config, unused_kwargs)
        # when called with return_unused_kwargs=True; handle both shapes.
        result = super().from_dict(config_dict, **kwargs)
        config = result[0] if isinstance(result, tuple) else result
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> Dict[str, Any]:
        # Serialize the typed quantization config back to a plain dict so
        # the result is JSON-serializable.
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/chatglm.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from
|
| 4 |
+
# https://github.com/THUDM/ChatGLM2-6B
|
| 5 |
+
from transformers import PretrainedConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ChatGLMConfig(PretrainedConfig):
    """Configuration for ChatGLM models (adapted from THUDM/ChatGLM2-6B)."""

    model_type = "chatglm"
    # Map standard HF attribute names onto ChatGLM's native field names so
    # generic code can read e.g. `num_hidden_layers`.
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        # ChatGLM checkpoints only carry the padded vocab size; expose it
        # under the standard `vocab_size` name as well.
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/cohere2.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# ruff: noqa
|
| 4 |
+
|
| 5 |
+
# Adapted from
|
| 6 |
+
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
|
| 7 |
+
from transformers import PretrainedConfig
|
| 8 |
+
from transformers.modeling_rope_utils import rope_config_validation
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Cohere2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        sliding_window_pattern (`int`, *optional*, defaults to 4):
            Pattern for the sliding window attention.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere Nextmodel configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration)  # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config  # doctest: +SKIP
    ```
    """

    model_type = "cohere2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        sliding_window=4096,
        sliding_window_pattern=4,
        cache_implementation="hybrid",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.logit_scale = logit_scale
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.sliding_window = sliding_window
        self.sliding_window_pattern = sliding_window_pattern
        # Need to specify head_dim in the config so it can be used in the attention forward functions
        self.head_dim = hidden_size // num_attention_heads
        self.cache_implementation = cache_implementation

        # Validate the correctness of rotary position embeddings parameters
        # NOTE: run before super().__init__ so invalid rope settings fail
        # before the base class processes remaining kwargs.
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
__all__ = ["Cohere2Config"]
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/deepseek_vl2.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class VisionEncoderConfig(PretrainedConfig):
    """Configuration for the DeepSeek-VL2 vision encoder (a timm SigLIP ViT).

    Class-level attributes record the defaults; ``__init__`` mirrors its
    explicit arguments onto the instance so they round-trip through
    ``PretrainedConfig`` serialization.
    """

    model_type: str = "vision"

    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384
    patch_size: int = 16
    width: int = 1024
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(self,
                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
                 image_size: int = 384,
                 patch_size: int = 16,
                 width: int = 1024,
                 layers: int = 24,
                 heads: int = 16,
                 mlp_ratio: int = 4,
                 global_pool: str = "map",
                 ignore_head: bool = True,
                 class_token: bool = False,
                 num_classes: int = 0,
                 use_checkpoint: bool = False,
                 **kwargs):
        # Mirror every explicit argument onto the instance before handing
        # the remaining kwargs to PretrainedConfig.
        for attr_name, attr_value in (
            ("model_name", model_name),
            ("image_size", image_size),
            ("patch_size", patch_size),
            ("width", width),
            ("layers", layers),
            ("heads", heads),
            ("mlp_ratio", mlp_ratio),
            ("global_pool", global_pool),
            ("ignore_head", ignore_head),
            ("class_token", class_token),
            ("num_classes", num_classes),
            ("use_checkpoint", use_checkpoint),
        ):
            setattr(self, attr_name, attr_value)

        super().__init__(**kwargs)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class MlpProjectorConfig(PretrainedConfig):
    """Configuration for the MLP projector between vision and language parts.

    Defaults are declared at class level; ``__init__`` copies its explicit
    arguments onto the instance and forwards the rest to ``PretrainedConfig``.
    """

    model_type = "mlp_projector"
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    token_pooling: bool = False

    def __init__(self,
                 projector_type: str = "downsample_mlp_gelu",
                 input_dim: int = 1152,
                 n_embed: int = 2048,
                 depth: int = 2,
                 mlp_ratio: int = 1,
                 downsample_ratio: int = 2,
                 **kwargs):
        # Persist the explicit arguments, then let PretrainedConfig consume
        # whatever is left in kwargs.
        for attr_name, attr_value in (
            ("projector_type", projector_type),
            ("input_dim", input_dim),
            ("n_embed", n_embed),
            ("depth", depth),
            ("mlp_ratio", mlp_ratio),
            ("downsample_ratio", downsample_ratio),
        ):
            setattr(self, attr_name, attr_value)

        super().__init__(**kwargs)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class DeepseekV2Config(PretrainedConfig):
    """Configuration for the DeepSeek-V2 language model used by DeepSeek-VL2.

    Stores MoE routing parameters (experts, groups, top-k), MLA attention
    dimensions (LoRA ranks, rope/nope head dims) and standard transformer
    hyperparameters, then forwards the token ids and tying flag to
    :class:`~transformers.PretrainedConfig`.
    """

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # NOTE: 'gready' (sic) is kept as-is from the upstream DeepSeek code
        # this file is adapted from; do not "fix" the spelling, since configs
        # in the wild carry this exact string.
        topk_method='gready',
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        # MLA (multi-head latent attention) dimensions.
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        # MoE routing parameters.
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # Coerce to float so the stored value is numeric regardless of how
        # the config file spelled it.
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class DeepseekVLV2Config(PretrainedConfig):
    """Top-level DeepSeek-VL2 config combining vision, projector and language
    sub-configs (the latter exposed as ``text_config``)."""

    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 # NOTE(review): this default disagrees with the class
                 # attribute "2D" above — kept as-is; confirm against the
                 # upstream DeepSeek-VL2 code before changing.
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: Tuple[Tuple[int,
                                                    int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        # Sub-configs arrive as plain dicts inside **kwargs; wrap each in its
        # typed config class (a missing one yields an all-defaults config).
        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        # Expose the language model's vocab size at the top level.
        self.vocab_size = self.text_config.vocab_size
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/eagle.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Optional, Union
|
| 5 |
+
|
| 6 |
+
from transformers import AutoConfig, PretrainedConfig
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class EAGLEConfig(PretrainedConfig):
    """Wrapper config for EAGLE speculative decoding.

    ``model`` holds the configuration of the wrapped architecture; kwargs
    that name attributes of that inner config are forwarded onto it, and the
    inner config's fields are re-exposed on this wrapper afterwards.
    """

    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        # Normalize ``model`` into a PretrainedConfig instance (or None).
        if model is None:
            model_config = None
        elif isinstance(model, dict):
            model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        # Forward overriding kwargs to the inner config, excluding the keys
        # that belong to this wrapper itself.
        for key, value in kwargs.items():
            if key in ("architectures", "model_type"):
                continue
            if hasattr(model_config, key):
                setattr(model_config, key, value)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        elif truncated_vocab_size is None:
            self.truncated_vocab_size = self.model.vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size

        kwargs.setdefault("architectures", ["EAGLEModel"])

        super().__init__(**kwargs)

        # Re-expose the inner config's fields on the wrapper without
        # clobbering anything PretrainedConfig already set.
        if self.model is not None:
            for key, value in self.model.to_dict().items():
                if not hasattr(self, key):
                    setattr(self, key, value)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        """Build an EAGLEConfig from a local path or hub identifier."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/exaone.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copied from
|
| 4 |
+
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
|
| 5 |
+
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
|
| 6 |
+
#
|
| 7 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 8 |
+
# you may not use this file except in compliance with the License.
|
| 9 |
+
# You may obtain a copy of the License at
|
| 10 |
+
#
|
| 11 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 12 |
+
#
|
| 13 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 14 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 15 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 16 |
+
# See the License for the specific language governing permissions and
|
| 17 |
+
# limitations under the License.
|
| 18 |
+
"""Exaone model configuration"""
|
| 19 |
+
|
| 20 |
+
from typing import Dict
|
| 21 |
+
|
| 22 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 23 |
+
from transformers.utils import logging
|
| 24 |
+
|
| 25 |
+
logger = logging.get_logger(__name__)
|
| 26 |
+
|
| 27 |
+
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ExaoneConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:
    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Exaone

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from :
    class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 50257):
            Vocabulary size of the GPT Lingvo model. Defines the number of
            different tokens that can be represented by the :obj:`inputs_ids`
            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
            size of the model.
            Defines the different tokens that can be represented by the
            `inputs_ids` passed to the forward method of :class:
            `~transformers.EXAONEModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers (:obj:`int`, `optional`, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi
            Head Attention (MHA), if `num_key_value_heads=1 the model will use
            Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
            specified, will default to `num_attention_heads`.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            percentage of hidden dimensions to allocate to rotary embeddings
        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
            the Transformer encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`,
        defaults to :obj:`"gelu_new"`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling
            :class:`~transformers.EXAONEModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
        gradient_checkpointing (:obj:`bool`, `optional`,
        defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense
            of slower backward pass.
    Example::

        >>> from transformers import ExaoneModel, ExaoneConfig

        >>> # Initializing an EXAONE configuration
        >>> configuration = ExaoneConfig()

        >>> # Initializing a model from configuration
        >>> model = ExaoneModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "exaone"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Standard HF name "num_hidden_layers" aliases this config's "num_layers".
    attribute_map = {"num_hidden_layers": "num_layers"}

    def __init__(
        self,
        vocab_size=102400,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        intermediate_size=None,
        activation_function="silu",
        rotary_pct=0.25,
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_layers
        # GQA fallback: unspecified key/value heads mean plain MHA.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # Any falsy value (None or 0) falls back to 4 * hidden_size.
        if intermediate_size:
            self.intermediate_size = intermediate_size
        else:
            self.intermediate_size = hidden_size * 4
        self.activation_function = activation_function
        self.resid_dropout = resid_dropout
        self.embed_dropout = embed_dropout
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rotary_pct = rotary_pct

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Extra EXAONE-specific switches are popped out of kwargs so they do
        # not leak into PretrainedConfig's generic attribute handling.
        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
        self.use_gated = kwargs.pop("use_gated", False)
        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
        self.rotary_type = kwargs.pop("rotary_type", None)
        self.scaling_factor = kwargs.pop("scaling_factor", 1)
        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
        # Defaults to True exactly when rotary_pct == 0.25.
        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                                 (rotary_pct == 0.25))
        # Rotary and absolute position embeddings are mutually exclusive.
        if self.use_rotary_pos:
            self.use_absolute_pos = False
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/falcon.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from
|
| 4 |
+
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
| 5 |
+
# Copyright 2023 The vLLM team.
|
| 6 |
+
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
|
| 7 |
+
# All rights reserved.
|
| 8 |
+
#
|
| 9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 10 |
+
# you may not use this file except in compliance with the License.
|
| 11 |
+
# You may obtain a copy of the License at
|
| 12 |
+
#
|
| 13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 14 |
+
#
|
| 15 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 18 |
+
# See the License for the specific language governing permissions and
|
| 19 |
+
# limitations under the License.
|
| 20 |
+
"""Falcon configuration"""
|
| 21 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class RWConfig(PretrainedConfig):
    """Falcon ("RW") model configuration, adapted from tiiuae/falcon-7b."""

    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Map standard HF attribute names onto Falcon's historical spellings.
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Older checkpoints spell hidden_size as "n_embed"; honor it if given.
        legacy_n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = legacy_n_embed if legacy_n_embed is not None \
            else hidden_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        # Unspecified KV heads mean fully multi-query attention (one head).
        self.n_head_kv = n_head_kv if n_head_kv is not None else 1
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b: force the new decoder layout.
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        """Per-head dimensionality of the attention projections."""
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        """Rotary embeddings are used exactly when ALiBi is not."""
        return not self.alibi
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/h2ovl.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from
|
| 4 |
+
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
|
| 5 |
+
# --------------------------------------------------------
|
| 6 |
+
# H2OVL-Mississippi
|
| 7 |
+
# Copyright (c) 2024 H2O.AI
|
| 8 |
+
# Licensed under Apache 2.0 License [see LICENSE for details]
|
| 9 |
+
# --------------------------------------------------------
|
| 10 |
+
|
| 11 |
+
from .internvl import InternVLChatConfig
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class H2OVLChatConfig(InternVLChatConfig):
    """Configuration for H2OVL chat models.

    Identical to :class:`InternVLChatConfig` apart from the ``model_type``
    marker used by the config registry.
    """
    model_type = "h2ovl_chat"
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/internvl.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from
|
| 4 |
+
# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
|
| 5 |
+
# --------------------------------------------------------
|
| 6 |
+
# InternVL
|
| 7 |
+
# Copyright (c) 2024 OpenGVLab
|
| 8 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 9 |
+
# --------------------------------------------------------
|
| 10 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class InternVLChatConfig(PretrainedConfig):
    """Composite configuration for InternVL chat models.

    Wraps a vision sub-config and an LLM sub-config (exposed as
    ``text_config``) as generic ``PretrainedConfig`` objects, plus the
    image-tiling and LoRA knobs used by the chat pipeline.
    """

    model_type = 'internvl_chat'
    is_composition = True

    def __init__(self,
                 vision_config=None,
                 llm_config=None,
                 use_backbone_lora=0,
                 use_llm_lora=0,
                 select_layer=-1,
                 force_image_size=None,
                 downsample_ratio=0.5,
                 template=None,
                 dynamic_image_size=False,
                 use_thumbnail=False,
                 ps_version='v1',
                 min_dynamic_patch=1,
                 max_dynamic_patch=6,
                 **kwargs):
        super().__init__(**kwargs)

        # Missing sub-configs become empty generic configs rather than None.
        vision_kwargs = {} if vision_config is None else vision_config
        llm_kwargs = {} if llm_config is None else llm_config
        self.vision_config = PretrainedConfig(**vision_kwargs)
        self.text_config = PretrainedConfig(**llm_kwargs)

        # Persist the scalar knobs on the instance.
        for attr_name, attr_value in (
            ("use_backbone_lora", use_backbone_lora),
            ("use_llm_lora", use_llm_lora),
            ("select_layer", select_layer),
            ("force_image_size", force_image_size),
            ("downsample_ratio", downsample_ratio),
            ("template", template),
            ("dynamic_image_size", dynamic_image_size),
            ("use_thumbnail", use_thumbnail),
            ("ps_version", ps_version),  # pixel shuffle version
            ("min_dynamic_patch", min_dynamic_patch),
            ("max_dynamic_patch", max_dynamic_patch),
        ):
            setattr(self, attr_name, attr_value)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/medusa.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Optional, Union
|
| 5 |
+
|
| 6 |
+
from transformers import PretrainedConfig
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MedusaConfig(PretrainedConfig):
    """Configuration for Medusa speculative-decoding draft heads."""

    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        # Fixed, effectively unbounded sequence-length cap (2**20).
        self.max_seq_len = int(2**20)
        if truncated_vocab_size is None:
            self.truncated_vocab_size = vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size
        kwargs.setdefault("architectures", ["MedusaModel"])

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        """Load a MedusaConfig, normalizing checkpoint key spellings.

        Any key containing both "num" and "heads" (resp. "layers") is
        remapped onto the canonical "num_heads" (resp. "num_hidden_layers").
        """
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for key in tuple(config_dict):
            if "num" not in key:
                continue
            if "heads" in key:
                config_dict["num_heads"] = config_dict.pop(key)
            elif "layers" in key:
                config_dict["num_hidden_layers"] = config_dict.pop(key)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        # Medusa heads carry no attention; report zero.
        return 0

    @property
    def num_lookahead_tokens(self):
        """Number of speculative tokens equals the number of Medusa heads."""
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mllama.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from transformers.models.mllama import configuration_mllama as mllama_hf_config
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    '''
    Use this class to override is_encoder_decoder:
    - transformers regards mllama as is_encoder_decoder=False
    - vllm needs is_encoder_decoder=True to enable cross-attention
    '''

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Must be set AFTER super().__init__ so the base class cannot
        # overwrite it from kwargs or its own default.
        self.is_encoder_decoder = True
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MllamaConfig(mllama_hf_config.MllamaConfig):
    """Mllama config that routes ``text_config`` through vLLM's
    :class:`MllamaTextConfig` override instead of the stock HF class."""

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        # A dict text_config is wrapped in the vLLM override class so its
        # is_encoder_decoder flag is forced on; an already-built config object
        # (or None) is passed through unchanged.
        if isinstance(text_config, dict):
            text_config = MllamaTextConfig(**text_config)
        super().__init__(text_config=text_config, **kwargs)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mpt.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copied from
|
| 4 |
+
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
|
| 5 |
+
"""A HuggingFace-style model configuration."""
|
| 6 |
+
import warnings
|
| 7 |
+
from typing import Any, Dict, Optional, Union
|
| 8 |
+
|
| 9 |
+
from transformers import PretrainedConfig
|
| 10 |
+
|
| 11 |
+
# Default sub-config dicts used when a checkpoint omits them; see
# MPTConfig._validate_config, which fills in any missing keys.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class MPTConfig(PretrainedConfig):
    """A HuggingFace-style configuration for MPT models.

    Copied from
    https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
    with one fix: the ``attn_config``/``ffn_config``/``init_config``
    parameters default to shared module-level dicts, and the original
    code stored those dicts un-copied while ``_validate_config``
    mutated them in place — so separate config instances (and the
    module-level defaults themselves) could end up sharing and
    corrupting state. Copies are stored instead, and the default
    merging is non-mutating.
    """

    model_type = 'mpt'
    # Map HF-standard attribute names onto MPT's native field names.
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Dict = attn_config_defaults,
                 ffn_config: Dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        """Build the config; argument names mirror MPT checkpoint
        fields. Raises via ``_validate_config`` on inconsistent
        settings."""
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        # Copy the sub-config dicts so later normalization cannot leak
        # into caller-owned dicts or the shared module-level defaults.
        self.attn_config = dict(attn_config)
        self.ffn_config = dict(ffn_config)
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = dict(init_config)
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        # Drop training-only fields some checkpoints carry.
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False):
            # ALiBi replaces learned positional embeddings.
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of `config` with any missing keys filled in
        from `config_defaults` (never mutates either argument)."""
        merged = dict(config)
        for k, v in config_defaults.items():
            merged.setdefault(k, v)
        return merged

    def _validate_config(self) -> None:
        """Normalize the sub-configs and reject inconsistent settings.

        Raises:
            ValueError: for out-of-range or unrecognized values.
            NotImplementedError: for unsupported feature combinations.
            ImportError: when TransformerEngine is requested but absent.
        """
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any(
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
             ]):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        # The following features are only implemented for the torch and
        # triton attention backends.
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            # TransformerEngine layers are optional; fail with a clear
            # installation hint if the import is unavailable.
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import fail. `fc_type: te` requires '
                    'TransformerEngine be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 is installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        # Propagate top-level choices into the FFN sub-config.
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nemotron.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
| 4 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 7 |
+
# you may not use this file except in compliance with the License.
|
| 8 |
+
# You may obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 15 |
+
# See the License for the specific language governing permissions and
|
| 16 |
+
# limitations under the License.
|
| 17 |
+
"""Nemotron model configuration"""
|
| 18 |
+
|
| 19 |
+
from transformers import PretrainedConfig
|
| 20 |
+
from transformers.utils import logging
|
| 21 |
+
|
| 22 |
+
logger = logging.get_logger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class NemotronConfig(PretrainedConfig):
    r"""Configuration class for a [`NemotronModel`].

    Instantiating with the defaults yields a configuration similar to
    Nemotron-8B. Inherits from [`PretrainedConfig`]; read its
    documentation for the generic options.

    Args:
        vocab_size: Vocabulary size (default 256000).
        hidden_size: Hidden-state dimension (default 6144).
        intermediate_size: MLP dimension (default 24576).
        num_hidden_layers: Decoder layer count (default 32).
        num_attention_heads: Attention heads per layer (default 48).
        head_dim: Per-head projection width; defaults to
            ``hidden_size // num_attention_heads`` when unset.
        num_key_value_heads: Number of KV heads for GQA/MQA; defaults
            to ``num_attention_heads`` (i.e. MHA).
        hidden_act: Decoder activation function (default ``"relu2"``).
        max_position_embeddings: Maximum sequence length (default 4096).
        initializer_range: Std of the truncated-normal initializer
            (default 0.0134).
        norm_eps: Epsilon used by normalization layers (default 1e-5).
        use_cache: Whether to return past key/values (default True).
        pad_token_id / bos_token_id / eos_token_id: Special token ids
            (defaults None / 2 / 3).
        tie_word_embeddings: Tie input/output embeddings (default
            False).
        rope_theta: RoPE base period (default 10000.0).
        rope_scaling: Optional dict ``{"type": "linear"|"dynamic",
            "factor": float > 1}``.
        partial_rotary_factor: Fraction of query/key dims with rotary
            embedding (default 0.5).
        attention_bias / mlp_bias: Whether the respective projections
            use bias terms (default False).
        attention_dropout: Dropout on attention probabilities
            (default 0.0).

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> configuration = NemotronConfig()
    >>> model = NemotronModel(configuration)
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Backward compatibility: older checkpoints store the head
        # width under "kv_channels".
        resolved_head_dim = head_dim or kwargs.get("kv_channels")
        if resolved_head_dim is None:
            resolved_head_dim = hidden_size // num_attention_heads
        self.head_dim = resolved_head_dim

        # Backward compatibility: plain MHA unless GQA/MQA requested.
        self.num_key_value_heads = (num_key_value_heads
                                    if num_key_value_heads is not None
                                    else num_attention_heads)

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Backward compatibility: legacy names "rope_percent" /
        # "rope_percentage" take precedence when present and truthy.
        self.partial_rotary_factor = (kwargs.get("rope_percent")
                                      or kwargs.get("rope_percentage")
                                      or partial_rotary_factor)
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """Validate the `rope_scaling` configuration."""
        scaling = self.rope_scaling
        if scaling is None:
            return

        if not isinstance(scaling, dict) or len(scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {scaling}")
        scaling_type = scaling.get("type", None)
        scaling_factor = scaling.get("factor", None)
        if scaling_type is None or scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {scaling_type}")
        if (scaling_factor is None
                or not isinstance(scaling_factor, float)
                or scaling_factor <= 1.0):
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{scaling_factor}")
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nvlm_d.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from
|
| 4 |
+
# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
|
| 5 |
+
# --------------------------------------------------------
|
| 6 |
+
# NVLM-D
|
| 7 |
+
# Copyright (c) 2024 NVIDIA
|
| 8 |
+
# Licensed under Apache 2.0 License [see LICENSE for details]
|
| 9 |
+
# --------------------------------------------------------
|
| 10 |
+
from .internvl import InternVLChatConfig
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class NVLM_D_Config(InternVLChatConfig):
    # NVLM-D reuses the InternVL chat configuration unchanged; only the
    # registered model_type differs.
    model_type = 'NVLM_D'
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/solar.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
| 6 |
+
# and OPT implementations in this library. It has been modified from its
|
| 7 |
+
# original forms to accommodate minor architectural differences compared
|
| 8 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
| 9 |
+
#
|
| 10 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 11 |
+
# you may not use this file except in compliance with the License.
|
| 12 |
+
# You may obtain a copy of the License at
|
| 13 |
+
#
|
| 14 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 15 |
+
#
|
| 16 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 17 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 18 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
+
# See the License for the specific language governing permissions and
|
| 20 |
+
# limitations under the License.
|
| 21 |
+
"""Solar model configuration"""
|
| 22 |
+
|
| 23 |
+
from transformers import PretrainedConfig
|
| 24 |
+
from transformers.utils import logging
|
| 25 |
+
|
| 26 |
+
logger = logging.get_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SolarConfig(PretrainedConfig):
    r"""Configuration class for a [`SolarModel`].

    Instantiating with the defaults yields a configuration similar to
    LLaMA-7B. Inherits from [`PretrainedConfig`]; read its
    documentation for the generic options.

    Args:
        vocab_size: Vocabulary size (default 32000).
        hidden_size: Hidden-state dimension (default 4096).
        intermediate_size: MLP dimension (default 11008).
        num_hidden_layers: Decoder layer count (default 32).
        num_attention_heads: Attention heads per layer (default 32).
        num_key_value_heads: Number of KV heads for GQA/MQA; defaults
            to ``num_attention_heads`` (i.e. MHA).
        hidden_act: Decoder activation function (default ``"silu"``).
        max_position_embeddings: Maximum sequence length (default
            2048; Solar 1 supports up to 2048 tokens, Solar 2 up to
            4096, CodeSolar up to 16384).
        initializer_range: Std of the truncated-normal initializer
            (default 0.02).
        rms_norm_eps: Epsilon for the RMS-norm layers (default 1e-6).
        use_cache: Whether to return past key/values (default True).
        pad_token_id / bos_token_id / eos_token_id: Special token ids
            (defaults None / 1 / 2).
        pretraining_tp: Experimental; tensor-parallel rank used during
            pretraining, needed for exact reproducibility (default 1).
        tie_word_embeddings: Tie input/output embeddings (default
            False).
        rope_theta: RoPE base period (default 10000.0).
        rope_scaling: Optional dict ``{"type": "linear"|"dynamic",
            "factor": float > 1}``; experimental, subject to breaking
            API changes.
        attention_bias / mlp_bias: Whether the respective projections
            use bias terms (default False).
        attention_dropout: Dropout on attention probabilities
            (default 0.0).
        sliding_window: Sliding-window attention size (default 2047).
        bskcn_1..bskcn_4, bskcn_tv: Block-skip-connection layer index
            lists / threshold values; see the per-argument defaults
            below. (Semantics defined by the Solar model code —
            assumed here, verify against the modeling file.)

    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> configuration = SolarConfig()
    >>> model = SolarModel(configuration)
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Backward compatibility: plain MHA unless GQA/MQA requested.
        self.num_key_value_heads = (num_key_value_heads
                                    if num_key_value_heads is not None
                                    else num_attention_heads)

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window
        # Fall back to the published Solar-pro layer indices/thresholds
        # when the checkpoint does not specify them.
        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """Validate the `rope_scaling` configuration."""
        scaling = self.rope_scaling
        if scaling is None:
            return

        if not isinstance(scaling, dict) or len(scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields,"
                " `type` and `factor`, "
                f"got {scaling}")
        scaling_type = scaling.get("type", None)
        scaling_factor = scaling.get("factor", None)
        if scaling_type is None or scaling_type not in ["linear", "dynamic"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of "
                             f"['linear', 'dynamic'], got {scaling_type}")
        if (scaling_factor is None
                or not isinstance(scaling_factor, float)
                or scaling_factor <= 1.0):
            raise ValueError(
                f"`rope_scaling`'s factor field must be a float > 1,"
                f" got {scaling_factor}")
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/ultravox.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
| 4 |
+
from typing import Any, Dict, Optional
|
| 5 |
+
|
| 6 |
+
import transformers
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    Configuration class for an Ultravox model
    ([`UltravoxForConditionalGeneration`]): an audio encoder projected
    into a text backbone.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            Custom audio config or dict. Ignored when `audio_model_id`
            is given.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of
            `LlamaConfig` or `MistralConfig`. Ignored when `text_model_id`
            is given.
        audio_model_id (`str`, *optional*):
            Model name/path whose config is loaded as the audio config.
        text_model_id (`str`, *optional*):
            Model name/path whose config is loaded as the text config.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        hidden_size (`int`, *optional*, defaults to 4096):
            Hidden size of the multimodal projector.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the text model.
        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the audio model.
    """

    model_type = "ultravox"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[Dict[str, Any]] = None,
        text_config: Optional[Dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[Dict[str, Any]] = None,
        audio_model_lora_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act

        # An explicit model id takes precedence over an inline config dict.
        if text_model_id is None:
            text_config = text_config or {}
            text_cls = transformers.CONFIG_MAPPING[text_config.get(
                "model_type", "llama")]
            self.text_config = text_cls(**text_config)
        else:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.text_config = get_config(text_model_id,
                                          trust_remote_code=False)

        if audio_model_id is None:
            audio_config = audio_config or {}
            audio_cls = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")]
            self.audio_config = audio_cls(**audio_config)
        else:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(audio_model_id,
                                           trust_remote_code=False)

        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        # Mirror a few text-backbone fields at the top level so generic
        # config consumers can find them without digging into text_config.
        self.vocab_size = self.text_config.vocab_size
        self.initializer_range = self.text_config.initializer_range

        super().__init__(**kwargs)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Optional
|
| 4 |
+
|
| 5 |
+
from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
|
| 6 |
+
Sequence, SequenceGroup)
|
| 7 |
+
|
| 8 |
+
from .detokenizer_utils import (convert_prompt_ids_to_tokens,
|
| 9 |
+
detokenize_incrementally)
|
| 10 |
+
from .tokenizer import AnyTokenizer
|
| 11 |
+
from .tokenizer_group import BaseTokenizerGroup
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Detokenizer:
    """Provides methods to decode the output of a model into text."""

    def __init__(self, tokenizer_group: BaseTokenizerGroup):
        # Holds (possibly per-LoRA) tokenizers; resolved per sequence below.
        self.tokenizer_group = tokenizer_group

    def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
        """Returns the HF tokenizer to use for a given sequence."""
        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)

    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
                                       prompt_logprobs: List[Optional[Dict[
                                           int, Logprob]]],
                                       position_offset: int) -> None:
        """Decodes the logprobs for the prompt of a sequence group.

        Args:
            seq_group: The sequence group to decode.
            prompt_logprobs: The logprobs to decode.
            position_offset: Offset of the first index of the logprobs
                relative to the start of the sequence (for chunked prefill).

        Returns:
            The prompt logprobs with the decoded tokens.
        """
        prms = seq_group.sampling_params
        assert prms is not None

        # We can pick any sequence for the prompt.
        seq = seq_group.get_seqs()[0]
        # Only prompt, without the generated token.
        all_token_ids = seq.get_token_ids()
        prompt_token_ids = all_token_ids[:-1]
        tokenizer = self.get_tokenizer_for_seq(seq)
        # Incremental-detokenization state, advanced once per prompt
        # position (not per logprob candidate).
        prefix_offset = 0
        read_offset = 0
        next_iter_prefix_offset = 0
        next_iter_read_offset = 0
        next_iter_tokens: List[str] = []
        prev_tokens = None

        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
                prompt_logprobs):

            # Absolute token position equals the index in the logprobs
            # list plus the offset of the entire logprobs list relative
            # to the start of the sequence.
            token_position = token_position_in_logprob + position_offset
            if not prompt_logprobs_for_token:
                continue
            for token_id, sample_logprob in prompt_logprobs_for_token.items():
                # Only decode candidates not decoded yet; invalid ids are
                # left as-is.
                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    prompt_token_ids_with_token = (
                        prompt_token_ids[:token_position] + [token_id])
                    (new_tokens, new_text, new_prefix_offset,
                     new_read_offset) = detokenize_incrementally(
                         tokenizer=tokenizer,
                         all_input_ids=prompt_token_ids_with_token,
                         prev_tokens=prev_tokens,
                         prefix_offset=prefix_offset,
                         read_offset=read_offset,
                         skip_special_tokens=prms.skip_special_tokens,
                         spaces_between_special_tokens=prms.
                         spaces_between_special_tokens,
                     )

                    sample_logprob.decoded_token = new_text

                    # Use the offsets & prev tokens corresponding to
                    # real tokens to ensure detokenization is consistent
                    # actual with prompt.
                    if token_id == all_token_ids[token_position]:
                        next_iter_prefix_offset = new_prefix_offset
                        next_iter_read_offset = new_read_offset
                        next_iter_tokens = new_tokens

            # Advance to the next token position.
            prefix_offset = next_iter_prefix_offset
            read_offset = next_iter_read_offset
            if prev_tokens is None:
                prev_tokens = next_iter_tokens.copy()
            else:
                prev_tokens.extend(next_iter_tokens)

    def decode_sequence_inplace(self, seq: Sequence,
                                prms: SamplingParams) -> int:
        """Decodes the new token for a sequence. In-place operation.

        Args:
            seq: The sequence to decode.
            prms: The sampling parameters used to generate the sequence.

        Returns:
            The number of characters added to the output text.
        """
        all_input_ids = seq.get_token_ids()
        token_id_generated_this_iteration = all_input_ids[-1]
        tokenizer = self.get_tokenizer_for_seq(seq)

        # Convert prompt token IDs to tokens if necessary.
        # Do it here so that we don't have to repeat this
        # computation for each logprob.
        if seq.tokens is None:
            (seq.tokens, seq.prefix_offset,
             seq.read_offset) = convert_prompt_ids_to_tokens(
                 tokenizer=tokenizer,
                 prompt_ids=all_input_ids[:-1],
                 skip_special_tokens=prms.skip_special_tokens,
             )

        (new_tokens, new_decoded_token_text, prefix_offset,
         read_offset) = detokenize_incrementally(
             tokenizer=tokenizer,
             all_input_ids=all_input_ids,
             prev_tokens=seq.tokens,
             prefix_offset=seq.prefix_offset,
             read_offset=seq.read_offset,
             skip_special_tokens=prms.skip_special_tokens,
             spaces_between_special_tokens=prms.spaces_between_special_tokens,
         )

        # Decode logprobs
        logprobs = seq.output_logprobs[-1]
        if logprobs:
            previous_tokens = all_input_ids[:-1]
            for token_id, sample_logprob in logprobs.items():
                # If the token was generated this iteration,
                # use the provided text.
                if token_id == token_id_generated_this_iteration:
                    sample_logprob.decoded_token = new_decoded_token_text
                    continue

                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    all_input_ids_with_logprob = previous_tokens + [token_id]
                    # NOTE(review): seq's offsets are deliberately NOT
                    # updated here — each candidate decodes against the
                    # same pre-update state.
                    (_, new_text, _, _) = detokenize_incrementally(
                        tokenizer=tokenizer,
                        all_input_ids=all_input_ids_with_logprob,
                        prev_tokens=seq.tokens,
                        prefix_offset=seq.prefix_offset,
                        read_offset=seq.read_offset,
                        skip_special_tokens=prms.skip_special_tokens,
                        spaces_between_special_tokens=prms.
                        spaces_between_special_tokens,
                    )
                    sample_logprob.decoded_token = new_text

        # Commit the incremental-detokenization state for the real token.
        seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_decoded_token_text

        return len(new_decoded_token_text)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer_utils.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
from .tokenizer import AnyTokenizer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _replace_none_with_empty(tokens: List[Optional[str]]):
|
| 9 |
+
for i, token in enumerate(tokens):
|
| 10 |
+
if token is None:
|
| 11 |
+
tokens[i] = ""
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _convert_tokens_to_string_with_added_encoders(
|
| 15 |
+
tokenizer: AnyTokenizer,
|
| 16 |
+
output_tokens: List[str],
|
| 17 |
+
skip_special_tokens: bool,
|
| 18 |
+
spaces_between_special_tokens: bool,
|
| 19 |
+
) -> str:
|
| 20 |
+
# Adapted from
|
| 21 |
+
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
|
| 22 |
+
# NOTE(woosuk): The following code is slow because it runs a for loop over
|
| 23 |
+
# the output_tokens. In Python, running a for loop over a list can be slow
|
| 24 |
+
# even when the loop body is very simple.
|
| 25 |
+
sub_texts: List[str] = []
|
| 26 |
+
current_sub_text: List[str] = []
|
| 27 |
+
all_special_tokens = set(tokenizer.all_special_tokens)
|
| 28 |
+
for token in output_tokens:
|
| 29 |
+
if skip_special_tokens and token in all_special_tokens:
|
| 30 |
+
continue
|
| 31 |
+
if token in tokenizer.get_added_vocab():
|
| 32 |
+
if current_sub_text:
|
| 33 |
+
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
| 34 |
+
sub_texts.append(sub_text)
|
| 35 |
+
current_sub_text = []
|
| 36 |
+
sub_texts.append(token)
|
| 37 |
+
else:
|
| 38 |
+
current_sub_text.append(token)
|
| 39 |
+
if current_sub_text:
|
| 40 |
+
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
| 41 |
+
sub_texts.append(sub_text)
|
| 42 |
+
if spaces_between_special_tokens:
|
| 43 |
+
return " ".join(sub_texts)
|
| 44 |
+
else:
|
| 45 |
+
return "".join(sub_texts)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5


def convert_prompt_ids_to_tokens(
    tokenizer: AnyTokenizer,
    prompt_ids: List[int],
    skip_special_tokens: bool = False,
) -> Tuple[List[str], int, int]:
    """Converts the prompt ids to tokens and returns the tokens and offsets
    for incremental detokenization.

    Note that not all tokens are converted to strings. Only the tokens that
    are necessary for incremental detokenization are converted to strings.
    """
    # We do not need to convert the whole prompt to tokens.
    # Offset a little more in case we have special tokens.
    window = INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET + 2
    new_tokens = tokenizer.convert_ids_to_tokens(
        prompt_ids[-window:], skip_special_tokens=skip_special_tokens)
    read_offset = len(new_tokens)
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # Guard against out-of-vocab prompt token ids: the tokenizer may map
    # them to None, which would break downstream string joins.
    for i, tok in enumerate(new_tokens):
        if tok is None:
            new_tokens[i] = ""
    return new_tokens, prefix_offset, read_offset
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: AnyTokenizer,
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        # Bootstrap the token list / offsets from the prompt prefix.
        (prev_tokens, prefix_offset,
         read_offset) = convert_prompt_ids_to_tokens(
             tokenizer,
             all_input_ids[:-1],
             skip_special_tokens=skip_special_tokens)
    assert prev_tokens is not None

    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens

    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        # Slow-tokenizer path: added-vocab tokens must be joined manually.
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        # Offsets are returned unchanged so the caller retries next step.
        return new_tokens, "", prefix_offset, read_offset

    new_text = new_text[len(prefix_text):]
    return new_tokens, new_text, read_offset, len(output_tokens)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processor.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import Any, cast
|
| 5 |
+
|
| 6 |
+
from transformers.processing_utils import ProcessorMixin
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    processor_cls: type[ProcessorMixin] = ProcessorMixin,
    **kwargs: Any,
):
    """Load a processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoProcessor

    # Default ProcessorMixin means "let AutoProcessor pick the class".
    if processor_cls == ProcessorMixin:
        factory = AutoProcessor
    else:
        factory = processor_cls

    try:
        loaded = factory.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        if trust_remote_code:
            raise e
        err_msg = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(ProcessorMixin, loaded)


cached_get_processor = lru_cache(get_processor)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def get_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if trust_remote_code:
            raise e
        err_msg = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(BaseImageProcessor, loaded)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def get_video_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers.image_processing_utils import BaseImageProcessor

    # The video processor hangs off the composite processor object.
    composite = get_processor(
        processor_name,
        *args,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )
    return cast(BaseImageProcessor, composite.video_processor)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
| 4 |
+
DeepseekVLV2Processor)
|
| 5 |
+
|
| 6 |
+
__all__ = ["DeepseekVLV2Processor"]
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (354 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc
ADDED
|
Binary file (15.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# yapf: disable
|
| 4 |
+
# ruff: noqa: E501
|
| 5 |
+
# coding=utf-8
|
| 6 |
+
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
| 7 |
+
# Copyright (c) 2023-2024 DeepSeek.
|
| 8 |
+
#
|
| 9 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
| 10 |
+
# this software and associated documentation files (the "Software"), to deal in
|
| 11 |
+
# the Software without restriction, including without limitation the rights to
|
| 12 |
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
| 13 |
+
# the Software, and to permit persons to whom the Software is furnished to do so,
|
| 14 |
+
# subject to the following conditions:
|
| 15 |
+
#
|
| 16 |
+
# The above copyright notice and this permission notice shall be included in all
|
| 17 |
+
# copies or substantial portions of the Software.
|
| 18 |
+
#
|
| 19 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 20 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
| 21 |
+
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
| 22 |
+
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
| 23 |
+
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
| 24 |
+
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 25 |
+
|
| 26 |
+
import math
|
| 27 |
+
from typing import List, Tuple
|
| 28 |
+
|
| 29 |
+
import torch
|
| 30 |
+
import torchvision.transforms as T
|
| 31 |
+
from PIL import Image, ImageOps
|
| 32 |
+
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
|
| 33 |
+
from transformers.processing_utils import ProcessorMixin
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ImageTransform:
    """PIL-image -> tensor pipeline: ToTensor plus optional Normalize.

    The ``mean``/``std``/``normalize`` settings are kept as attributes
    because callers (e.g. the processor's padding logic) read them back.
    """

    def __init__(self,
                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 normalize: bool = True):
        self.mean = mean
        self.std = std
        self.normalize = normalize

        # Build the pipeline once; Normalize is appended only when requested.
        steps = [T.ToTensor()]
        if normalize:
            steps.append(T.Normalize(mean, std))
        self.transform = T.Compose(steps)

    def __call__(self, pil_img: Image.Image):
        """Apply the composed transform to a PIL image and return a tensor."""
        return self.transform(pil_img)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class DeepseekVLV2Processor(ProcessorMixin):
    """Processor for DeepSeek-VL2: interleaves tokenized text with image
    placeholder tokens and produces the tiled pixel tensors the vision
    encoder expects (one padded global view plus a grid of local crops).
    """
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]

    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        candidate_resolutions: Tuple[Tuple[int, int]],
        patch_size: int,
        downsample_ratio: int,
        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<|▁pad▁|>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):

        self.candidate_resolutions = candidate_resolutions
        # The base tile size is taken from the first candidate resolution.
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio

        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
        self.tokenizer = tokenizer
        # Left padding is required: padding side changes results in batched inference.
        self.tokenizer.padding_side = 'left'

        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
        if tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': pad_token})

        # Add the image placeholder token if the vocabulary lacks it, then
        # cache its id for fast comparisons later.
        image_token_id = self.tokenizer.vocab.get(image_token)
        if image_token_id is None:
            special_tokens = [image_token]
            special_tokens_dict = {"additional_special_tokens": special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
        self.image_token_id = self.tokenizer.vocab.get(image_token)

        # add five special tokens for grounding-related tasks
        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        # add special tokens for SFT data
        special_tokens = ["<|User|>", "<|Assistant|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id

        super().__init__(
            tokenizer,
            **kwargs,
        )

    def select_best_resolution(self, image_size):
        """Pick the candidate (width, height) that best fits the image.

        Maximizes the effective (non-upscaled) resolution and, on ties,
        minimizes wasted area. Used for cropping into local views.
        """
        # used for cropping
        original_width, original_height = image_size
        best_fit = None
        max_effective_resolution = 0
        min_wasted_resolution = float("inf")

        for width, height in self.candidate_resolutions:
            scale = min(width / original_width, height / original_height)
            downscaled_width, downscaled_height = int(
                original_width * scale), int(original_height * scale)
            effective_resolution = min(downscaled_width * downscaled_height,
                                       original_width * original_height)
            wasted_resolution = (width * height) - effective_resolution

            if effective_resolution > max_effective_resolution or (
                    effective_resolution == max_effective_resolution
                    and wasted_resolution < min_wasted_resolution):
                max_effective_resolution = effective_resolution
                min_wasted_resolution = wasted_resolution
                best_fit = (width, height)

        return best_fit

    @property
    def bos_id(self):
        # Beginning-of-sequence token id from the wrapped tokenizer.
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        # End-of-sequence token id from the wrapped tokenizer.
        return self.tokenizer.eos_token_id

    @property
    def pad_id(self):
        # Padding token id from the wrapped tokenizer.
        return self.tokenizer.pad_token_id

    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Tokenize ``text`` and optionally wrap it with bos/eos ids."""
        t = self.tokenizer.encode(text, add_special_tokens=False)

        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]

        return t

    def decode(self, t: List[int], **kwargs) -> str:
        """Decode token ids back to text via the wrapped tokenizer."""
        return self.tokenizer.decode(t, **kwargs)

    def process_one(
        self,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """
        Tokenize one prompt with its images and build the model inputs.

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs:

        Returns:
            outputs (BatchFeature): the output of the processor,
                - input_ids (torch.LongTensor): [1, N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - images_seq_mask (torch.BoolTensor): True at image-token positions
                - images_spatial_crop (torch.LongTensor): per-image (w_tiles, h_tiles)
                - num_image_tokens (List[int]): the number of image tokens
        """

        assert (prompt is not None and images is not None
                ), "prompt and images must be used at the same time."

        sft_format = prompt
        # Cropping into local tiles is only enabled for <= 2 images to bound
        # the total number of vision patches.
        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
        # Build the training-target sequence: image positions are ignored.
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
                masked_tokenized_str.append(token_index)
            else:
                masked_tokenized_str.append(self.ignore_id)

        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")

        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
        target_ids[(input_ids < 0) |
                   (input_ids == self.image_token_id)] = self.ignore_id
        input_ids[input_ids < 0] = self.pad_id

        if inference_mode:
            # Strip the trailing eos token for generation.
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            target_ids = target_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]

        if len(images_list) == 0:
            # Text-only prompt: emit a dummy zero image so downstream shapes
            # stay consistent.
            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

        input_ids = input_ids.unsqueeze(0)

        prepare = BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )
        return prepare

    def __call__(
        self,
        *,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """
        Keyword-only entry point; delegates to :meth:`process_one`.

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs:

        Returns:
            outputs (BatchFeature): the output of the processor,
                - input_ids (torch.LongTensor): [1, N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - images_seq_mask / images_spatial_crop / num_image_tokens
        """

        prepare = self.process_one(
            prompt=prompt,
            images=images,
            inference_mode=inference_mode,
        )

        return prepare

    def tokenize_with_images(
        self,
        conversation: str,
        images: List[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Each ``<image>`` occurrence is replaced by placeholder token ids
        covering one padded global view plus (when ``cropping``) a grid of
        local tiles chosen by :meth:`select_best_resolution`.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)
        images_list, images_seq_mask, images_spatial_crop = [], [], []
        num_image_tokens = []
        tokenized_str = []
        for text_sep, image in zip(text_splits, images):
            """encode text_sep"""
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)

            """select best resolution for anyres"""
            if cropping:
                best_width, best_height = self.select_best_resolution(image.size)
            else:
                best_width, best_height = self.image_size, self.image_size

            """process the global view"""
            # Pad (letterbox) with the normalization mean color so padded
            # pixels become ~0 after normalization.
            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
            images_list.append(self.image_transform(global_view))

            """process the local views"""
            local_view = ImageOps.pad(image, (best_width, best_height),
                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
            # Slice the padded image into image_size x image_size tiles,
            # row-major (top-to-bottom, left-to-right).
            for i in range(0, best_height, self.image_size):
                for j in range(0, best_width, self.image_size):
                    images_list.append(
                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))

            """record height / width crop num"""
            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
            images_spatial_crop.append([num_width_tiles, num_height_tiles])

            """add image tokens"""
            # h/w: number of vision tokens per tile edge after patchification
            # and downsampling.
            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))

        """process the last text split"""
        # zip() above stops at the last image, so the trailing text segment
        # still needs to be encoded here.
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        """add the bos and eos tokens"""
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

        assert len(tokenized_str) == len(
            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
# Register under the class-name key so AutoProcessor.from_pretrained can
# resolve checkpoints whose config references "DeepseekVLV2Processor".
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/s3_utils.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import fnmatch
|
| 4 |
+
import os
|
| 5 |
+
import shutil
|
| 6 |
+
import signal
|
| 7 |
+
import tempfile
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from vllm.utils import PlaceholderModule
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import boto3
|
| 15 |
+
except ImportError:
|
| 16 |
+
boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
| 20 |
+
return [
|
| 21 |
+
path for path in paths if any(
|
| 22 |
+
fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
|
| 27 |
+
return [
|
| 28 |
+
path for path in paths
|
| 29 |
+
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def glob(s3=None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
    List full file names from S3 path and filter by allow pattern.

    Args:
        s3: S3 client to use; a default client is created when None.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.

    Returns:
        list[str]: List of full S3 paths allowed by the pattern
    """
    client = s3 if s3 is not None else boto3.client("s3")
    bucket_name, _, keys = list_files(client,
                                      path=path,
                                      allow_pattern=allow_pattern)
    return [f"s3://{bucket_name}/{key}" for key in keys]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def list_files(
        s3,
        path: str,
        allow_pattern: Optional[list[str]] = None,
        ignore_pattern: Optional[list[str]] = None
) -> tuple[str, str, list[str]]:
    """
    List files from S3 path and filter by pattern.

    Args:
        s3: S3 client to use.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.
        ignore_pattern: A list of patterns of which files not to pull.

    Returns:
        tuple[str, str, list[str]]: A tuple where:
            - The first element is the bucket name
            - The second element is string represent the bucket
              and the prefix as a dir like string
            - The third element is a list of files allowed or
              disallowed by pattern
    """
    # "s3://bucket/some/prefix" -> bucket="bucket", prefix="some/prefix".
    stripped = path.removeprefix('s3://')
    bucket_name, _, prefix = stripped.partition('/')

    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    keys = [entry['Key'] for entry in response.get('Contents', [])]

    # Drop "directory" placeholder keys (those ending in a slash).
    keys = _filter_ignore(keys, ["*/"])
    if allow_pattern is not None:
        keys = _filter_allow(keys, allow_pattern)
    if ignore_pattern is not None:
        keys = _filter_ignore(keys, ignore_pattern)

    return bucket_name, prefix, keys
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class S3Model:
    """
    A class representing a S3 model mirrored into a temporary directory.

    Attributes:
        s3: S3 client.
        dir: The temporary created directory.

    Methods:
        pull_files(): Pull model from S3 to the temporary directory.
    """

    def __init__(self) -> None:
        self.s3 = boto3.client('s3')
        # Chain our cleanup in front of any existing SIGINT/SIGTERM handler
        # so the temporary directory is removed even on interruption.
        for sig in (signal.SIGINT, signal.SIGTERM):
            existing_handler = signal.getsignal(sig)
            signal.signal(sig, self._close_by_signal(existing_handler))
        self.dir = tempfile.mkdtemp()

    def __del__(self):
        # Best-effort cleanup when the object is garbage-collected.
        self._close()

    def _close(self) -> None:
        # Remove the mirror directory; safe to call more than once.
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)

    def _close_by_signal(self, existing_handler=None):
        # Build a signal handler that cleans up and then delegates to the
        # previously installed handler (if any).

        def new_handler(signum, frame):
            self._close()
            if existing_handler:
                existing_handler(signum, frame)

        return new_handler

    def pull_files(self,
                   s3_model_path: str = "",
                   allow_pattern: Optional[list[str]] = None,
                   ignore_pattern: Optional[list[str]] = None) -> None:
        """
        Pull files from S3 storage into the temporary directory.

        Args:
            s3_model_path: The S3 path of the model.
            allow_pattern: A list of patterns of which files to pull.
            ignore_pattern: A list of patterns of which files not to pull.

        """
        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
                                                  allow_pattern,
                                                  ignore_pattern)
        if len(files) == 0:
            return

        for file in files:
            # Recreate the key's directory structure (relative to the
            # prefix) under the local temporary directory.
            destination_file = os.path.join(self.dir,
                                            file.removeprefix(base_dir))
            local_dir = Path(destination_file).parent
            os.makedirs(local_dir, exist_ok=True)
            self.s3.download_file(bucket_name, file, destination_file)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import contextlib
|
| 4 |
+
import os
|
| 5 |
+
import warnings
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from types import MethodType
|
| 8 |
+
from typing import Optional, Union
|
| 9 |
+
|
| 10 |
+
import huggingface_hub
|
| 11 |
+
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
| 12 |
+
PreTrainedTokenizerFast)
|
| 13 |
+
|
| 14 |
+
from vllm.envs import VLLM_USE_MODELSCOPE
|
| 15 |
+
from vllm.logger import init_logger
|
| 16 |
+
from vllm.lora.request import LoRARequest
|
| 17 |
+
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
| 18 |
+
from vllm.transformers_utils.utils import check_gguf_file
|
| 19 |
+
from vllm.utils import make_async
|
| 20 |
+
|
| 21 |
+
logger = init_logger(__name__)
|
| 22 |
+
|
| 23 |
+
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
| 24 |
+
MistralTokenizer]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def decode_tokens(
    tokenizer: AnyTokenizer,
    token_ids: list[int],
    *,
    skip_special_tokens: bool = False,
) -> str:
    """
    Backend-agnostic equivalent of HF's
    :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`.
    """
    # All supported tokenizer backends expose a compatible decode().
    decoded = tokenizer.decode(token_ids,
                               skip_special_tokens=skip_special_tokens)
    return decoded
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def encode_tokens(
    tokenizer: AnyTokenizer,
    text: str,
    *,
    add_special_tokens: Optional[bool] = None,
) -> list[int]:
    """
    Backend-agnostic equivalent of HF's
    :code:`tokenizer.encode(text, add_special_tokens=...)`.
    """
    if isinstance(tokenizer, MistralTokenizer):
        # Mistral's underlying tokenizer controls special tokens through
        # separate bos/eos flags rather than a single switch.
        return tokenizer.tokenizer.encode(text,
                                          bos=add_special_tokens,
                                          eos=add_special_tokens)
    if add_special_tokens is None:
        # Let the HF tokenizer apply its own default behavior.
        return tokenizer.encode(text)
    return tokenizer.encode(text, add_special_tokens=add_special_tokens)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
    """Get tokenizer with cached properties.

    This will patch the tokenizer object in place.

    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown. This
    function caches these properties for faster access."""

    # Snapshot the (expensive) derived values once, up front.
    tokenizer_all_special_ids = set(tokenizer.all_special_ids)
    tokenizer_all_special_tokens_extended = (
        tokenizer.all_special_tokens_extended)
    tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
    tokenizer_vocab = tokenizer.get_vocab()
    tokenizer_len = len(tokenizer)

    max_token_id = max(tokenizer_vocab.values())
    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
    # are added and included in the implementation of the vocab_size
    # property, but not in get_vocab(); if there is an implementation
    # of vocab size, we should take the greater value.
    if hasattr(tokenizer, "vocab_size"):
        with contextlib.suppress(NotImplementedError):
            max_token_id = max(max_token_id, tokenizer.vocab_size)

    # Dynamically subclass the tokenizer's own class and override the
    # hot properties with closures over the precomputed values.
    class CachedTokenizer(tokenizer.__class__):  # type: ignore

        @property
        def all_special_ids(self):
            return tokenizer_all_special_ids

        @property
        def all_special_tokens(self):
            return tokenizer_all_special_tokens

        @property
        def all_special_tokens_extended(self):
            return tokenizer_all_special_tokens_extended

        @property
        def max_token_id(self):
            return max_token_id

        def get_vocab(self):
            return tokenizer_vocab

        def __len__(self):
            return tokenizer_len

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

    # Swap the instance's class in place so existing references benefit.
    tokenizer.__class__ = CachedTokenizer
    return tokenizer
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
    """Patch _pad method to accept `padding_side` for older tokenizers."""
    original_pad = tokenizer._pad

    def _pad(
        self: PreTrainedTokenizer,
        *args,
        padding_side: Optional[str] = None,
        **kwargs,
    ):
        # Older tokenizers cannot honor a per-call padding side; warn when
        # the caller requests one that differs from the configured side.
        if padding_side is not None:
            if padding_side != self.padding_side:
                msg = ("`padding_side` argument is not supported by "
                       f"{type(tokenizer).__name__} and will be ignored.")
                warnings.warn(msg, stacklevel=2)

        return original_pad(*args, **kwargs)

    tokenizer._pad = MethodType(_pad, tokenizer)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_tokenizer(
    tokenizer_name: Union[str, Path],
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> AnyTokenizer:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
    """
    if VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            tokenizer_path = snapshot_download(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        # Explicit slow mode must not be combined with use_fast=True.
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    if "truncation_side" not in kwargs:
        kwargs["truncation_side"] = "left"

    # Separate model folder from file path for GGUF models
    is_gguf = check_gguf_file(tokenizer_name)
    if is_gguf:
        kwargs["gguf_file"] = Path(tokenizer_name).name
        tokenizer_name = Path(tokenizer_name).parent

    # if tokenizer is from official mistral org
    is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
    if is_from_mistral_org and tokenizer_mode != "mistral":
        warnings.warn(
            'It is strongly recommended to run mistral models with '
            '`--tokenizer-mode "mistral"` to ensure correct '
            'encoding and decoding.',
            FutureWarning,
            stacklevel=2)

    if tokenizer_mode == "mistral":
        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                     revision=revision)
    else:
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                    "does not exist or is not currently imported." in str(e)
                    or "requires you to execute the tokenizer file" in str(e)):
                err_msg = ("Failed to load the tokenizer. If the tokenizer "
                           "is a custom tokenizer not yet available in the "
                           "HuggingFace transformers library, consider "
                           "setting `trust_remote_code=True` in LLM or using "
                           "the `--trust-remote-code` flag in the CLI.")
                raise RuntimeError(err_msg) from e
            else:
                raise e

        # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
        if type(tokenizer).__name__ in ("ChatGLMTokenizer",
                                        "ChatGLM4Tokenizer"):
            assert isinstance(tokenizer, PreTrainedTokenizer)
            patch_padding_side(tokenizer)

        if not isinstance(tokenizer, PreTrainedTokenizerFast):
            logger.warning(
                "Using a slow tokenizer. This might cause a significant "
                "slowdown. Consider using a fast tokenizer instead.")
        # Cache expensive derived properties on the HF tokenizer instance.
        tokenizer = get_cached_tokenizer(tokenizer)

    return tokenizer
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def get_lora_tokenizer(lora_request: LoRARequest, *args,
|
| 230 |
+
**kwargs) -> Optional[AnyTokenizer]:
|
| 231 |
+
if lora_request is None:
|
| 232 |
+
return None
|
| 233 |
+
try:
|
| 234 |
+
tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
|
| 235 |
+
except Exception as e:
|
| 236 |
+
# No tokenizer was found in the LoRA folder,
|
| 237 |
+
# use base model tokenizer
|
| 238 |
+
logger.warning(
|
| 239 |
+
"No tokenizer found in %s, using base model tokenizer instead. "
|
| 240 |
+
"(Exception: %s)", lora_request.lora_path, e)
|
| 241 |
+
tokenizer = None
|
| 242 |
+
return tokenizer
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__init__.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Type
|
| 4 |
+
|
| 5 |
+
from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
|
| 6 |
+
SchedulerConfig, TokenizerPoolConfig)
|
| 7 |
+
from vllm.executor.ray_utils import ray
|
| 8 |
+
|
| 9 |
+
from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
|
| 10 |
+
from .tokenizer_group import TokenizerGroup
|
| 11 |
+
|
| 12 |
+
if ray:
|
| 13 |
+
from .ray_tokenizer_group import RayTokenizerGroupPool
|
| 14 |
+
else:
|
| 15 |
+
RayTokenizerGroupPool = None # type: ignore
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def init_tokenizer_from_configs(model_config: ModelConfig,
|
| 19 |
+
scheduler_config: SchedulerConfig,
|
| 20 |
+
parallel_config: ParallelConfig,
|
| 21 |
+
lora_config: LoRAConfig):
|
| 22 |
+
init_kwargs = dict(tokenizer_id=model_config.tokenizer,
|
| 23 |
+
enable_lora=bool(lora_config),
|
| 24 |
+
max_num_seqs=scheduler_config.max_num_seqs,
|
| 25 |
+
max_loras=lora_config.max_loras if lora_config else 0,
|
| 26 |
+
max_input_length=None,
|
| 27 |
+
tokenizer_mode=model_config.tokenizer_mode,
|
| 28 |
+
trust_remote_code=model_config.trust_remote_code,
|
| 29 |
+
revision=model_config.tokenizer_revision,
|
| 30 |
+
truncation_side=model_config.truncation_side)
|
| 31 |
+
|
| 32 |
+
return get_tokenizer_group(parallel_config.tokenizer_pool_config,
|
| 33 |
+
**init_kwargs)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
| 37 |
+
**init_kwargs) -> BaseTokenizerGroup:
|
| 38 |
+
tokenizer_cls: Type[BaseTokenizerGroup]
|
| 39 |
+
if tokenizer_pool_config is None:
|
| 40 |
+
tokenizer_cls = TokenizerGroup
|
| 41 |
+
elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass(
|
| 42 |
+
tokenizer_pool_config.pool_type, BaseTokenizerGroup):
|
| 43 |
+
tokenizer_cls = tokenizer_pool_config.pool_type
|
| 44 |
+
elif tokenizer_pool_config.pool_type == "ray":
|
| 45 |
+
if RayTokenizerGroupPool is None:
|
| 46 |
+
raise ImportError(
|
| 47 |
+
"RayTokenizerGroupPool is not available. Please install "
|
| 48 |
+
"the ray package to use the Ray tokenizer group pool.")
|
| 49 |
+
tokenizer_cls = RayTokenizerGroupPool
|
| 50 |
+
else:
|
| 51 |
+
raise ValueError(
|
| 52 |
+
f"Unknown pool type: {tokenizer_pool_config.pool_type}")
|
| 53 |
+
return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"]
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.67 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc
ADDED
|
Binary file (3.43 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc
ADDED
|
Binary file (12.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-311.pyc
ADDED
|
Binary file (5.92 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
|
| 6 |
+
from vllm.config import TokenizerPoolConfig
|
| 7 |
+
from vllm.lora.request import LoRARequest
|
| 8 |
+
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BaseTokenizerGroup(ABC):
|
| 12 |
+
"""A group of tokenizers that can be used for LoRA adapters."""
|
| 13 |
+
|
| 14 |
+
@classmethod
|
| 15 |
+
@abstractmethod
|
| 16 |
+
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
| 17 |
+
**init_kwargs) -> "BaseTokenizerGroup":
|
| 18 |
+
pass
|
| 19 |
+
|
| 20 |
+
@abstractmethod
|
| 21 |
+
def ping(self) -> bool:
|
| 22 |
+
"""Check if the tokenizer group is alive."""
|
| 23 |
+
pass
|
| 24 |
+
|
| 25 |
+
@abstractmethod
|
| 26 |
+
def get_max_input_len(
|
| 27 |
+
self,
|
| 28 |
+
lora_request: Optional[LoRARequest] = None,
|
| 29 |
+
) -> Optional[int]:
|
| 30 |
+
"""Get the maximum input length for the LoRA request."""
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
@abstractmethod
|
| 34 |
+
def encode(self,
|
| 35 |
+
prompt: str,
|
| 36 |
+
request_id: Optional[str] = None,
|
| 37 |
+
lora_request: Optional[LoRARequest] = None,
|
| 38 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 39 |
+
"""Encode a prompt using the tokenizer group."""
|
| 40 |
+
pass
|
| 41 |
+
|
| 42 |
+
@abstractmethod
|
| 43 |
+
async def encode_async(
|
| 44 |
+
self,
|
| 45 |
+
prompt: str,
|
| 46 |
+
request_id: Optional[str] = None,
|
| 47 |
+
lora_request: Optional[LoRARequest] = None,
|
| 48 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 49 |
+
"""Encode a prompt using the tokenizer group."""
|
| 50 |
+
pass
|
| 51 |
+
|
| 52 |
+
@abstractmethod
|
| 53 |
+
def get_lora_tokenizer(
|
| 54 |
+
self,
|
| 55 |
+
lora_request: Optional[LoRARequest] = None,
|
| 56 |
+
) -> AnyTokenizer:
|
| 57 |
+
"""Get a tokenizer for a LoRA request."""
|
| 58 |
+
pass
|
| 59 |
+
|
| 60 |
+
@abstractmethod
|
| 61 |
+
async def get_lora_tokenizer_async(
|
| 62 |
+
self,
|
| 63 |
+
lora_request: Optional[LoRARequest] = None,
|
| 64 |
+
) -> AnyTokenizer:
|
| 65 |
+
"""Get a tokenizer for a LoRA request."""
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
def check_health(self):
|
| 69 |
+
"""Raise exception if the tokenizer group is unhealthy."""
|
| 70 |
+
return
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import os
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
try:
|
| 8 |
+
from ray.exceptions import ActorDiedError # type: ignore
|
| 9 |
+
except ImportError:
|
| 10 |
+
# For older versions of Ray
|
| 11 |
+
from ray.exceptions import RayActorError as ActorDiedError # type: ignore
|
| 12 |
+
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
| 13 |
+
|
| 14 |
+
from vllm.config import TokenizerPoolConfig
|
| 15 |
+
from vllm.executor.ray_utils import ray
|
| 16 |
+
from vllm.logger import init_logger
|
| 17 |
+
from vllm.lora.request import LoRARequest
|
| 18 |
+
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
| 19 |
+
|
| 20 |
+
from .base_tokenizer_group import BaseTokenizerGroup
|
| 21 |
+
from .tokenizer_group import TokenizerGroup
|
| 22 |
+
|
| 23 |
+
logger = init_logger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class RayTokenizerGroupPool(BaseTokenizerGroup):
|
| 27 |
+
"""A Ray-based pool of TokenizerGroups for async tokenization."""
|
| 28 |
+
|
| 29 |
+
# Class to use for workers making up the pool.
|
| 30 |
+
_worker_cls = TokenizerGroup
|
| 31 |
+
|
| 32 |
+
@classmethod
|
| 33 |
+
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
| 34 |
+
**init_kwargs) -> "RayTokenizerGroupPool":
|
| 35 |
+
if not tokenizer_pool_config:
|
| 36 |
+
raise ValueError("tokenizer_pool_config must not be None.")
|
| 37 |
+
ray_actor_options = (tokenizer_pool_config.extra_config or {
|
| 38 |
+
"num_cpus": 0
|
| 39 |
+
})
|
| 40 |
+
ray_actor_options.setdefault(
|
| 41 |
+
"scheduling_strategy",
|
| 42 |
+
NodeAffinitySchedulingStrategy(
|
| 43 |
+
node_id=ray.get_runtime_context().get_node_id(), soft=True))
|
| 44 |
+
|
| 45 |
+
# Carry over the env vars to the actors.
|
| 46 |
+
# This is necessary for API keys and such.
|
| 47 |
+
ray_actor_options.setdefault("runtime_env", {})
|
| 48 |
+
_carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
|
| 49 |
+
|
| 50 |
+
init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
|
| 51 |
+
init_kwargs["ray_actor_options"] = ray_actor_options
|
| 52 |
+
|
| 53 |
+
return cls(**init_kwargs)
|
| 54 |
+
|
| 55 |
+
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
| 56 |
+
max_input_length: Optional[int], num_actors: int,
|
| 57 |
+
ray_actor_options: dict, **tokenizer_config):
|
| 58 |
+
# Store a local copy of the TokenizerGroup for quick access
|
| 59 |
+
# to underlying HF tokenizers.
|
| 60 |
+
self._tokenizer_config = {
|
| 61 |
+
"tokenizer_id": tokenizer_id,
|
| 62 |
+
"enable_lora": enable_lora,
|
| 63 |
+
"max_num_seqs": max_num_seqs,
|
| 64 |
+
"max_input_length": max_input_length,
|
| 65 |
+
**tokenizer_config
|
| 66 |
+
}
|
| 67 |
+
self._local_tokenizer_group = self._worker_cls(
|
| 68 |
+
**self._tokenizer_config, )
|
| 69 |
+
|
| 70 |
+
self._ray_tokenizer_group_cls = ray.remote(
|
| 71 |
+
self._worker_cls).options(**ray_actor_options) # type: ignore
|
| 72 |
+
self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
|
| 73 |
+
self._idle_actors: Optional[asyncio.Queue] = None
|
| 74 |
+
|
| 75 |
+
# If set, actor is unhealthy. Will reraise on the next
|
| 76 |
+
# check_health call.
|
| 77 |
+
self._exception: Optional[ActorDiedError] = None
|
| 78 |
+
|
| 79 |
+
def _init_actor(self) -> ray.ObjectRef:
|
| 80 |
+
return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)
|
| 81 |
+
|
| 82 |
+
@property
|
| 83 |
+
def pool_size(self) -> int:
|
| 84 |
+
return len(self.tokenizer_actors)
|
| 85 |
+
|
| 86 |
+
def ping(self):
|
| 87 |
+
return ray.get([
|
| 88 |
+
actor.ping.remote() # type: ignore
|
| 89 |
+
for actor in self.tokenizer_actors
|
| 90 |
+
])
|
| 91 |
+
|
| 92 |
+
def _ensure_queue_initialized(self):
|
| 93 |
+
if self._idle_actors is None:
|
| 94 |
+
self._idle_actors = asyncio.Queue()
|
| 95 |
+
for actor in self.tokenizer_actors:
|
| 96 |
+
self._idle_actors.put_nowait(actor)
|
| 97 |
+
|
| 98 |
+
def _finalize_encode(self, actor: ray.ObjectRef,
|
| 99 |
+
original_actor: ray.ObjectRef, actor_is_alive: bool):
|
| 100 |
+
assert self._idle_actors is not None
|
| 101 |
+
# Cleanup the dead actor.
|
| 102 |
+
if not actor_is_alive or original_actor is not actor:
|
| 103 |
+
self.tokenizer_actors.remove(original_actor)
|
| 104 |
+
if actor_is_alive:
|
| 105 |
+
# Put the actor back in the queue.
|
| 106 |
+
# This is done in a finally block to ensure that the actor is
|
| 107 |
+
# always put back in the queue, even if an exception/cancellation
|
| 108 |
+
# is raised.
|
| 109 |
+
self._idle_actors.put_nowait(actor)
|
| 110 |
+
# Add back the new actor.
|
| 111 |
+
if original_actor is not actor:
|
| 112 |
+
self.tokenizer_actors.append(actor)
|
| 113 |
+
|
| 114 |
+
def encode(self,
|
| 115 |
+
prompt: str,
|
| 116 |
+
request_id: Optional[str] = None,
|
| 117 |
+
lora_request: Optional[LoRARequest] = None,
|
| 118 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 119 |
+
"""Encode a prompt using the tokenizer group.
|
| 120 |
+
|
| 121 |
+
We pick an idle actor and use it to encode the prompt.
|
| 122 |
+
The actor is then put back in the queue for future use.
|
| 123 |
+
This is blocking.
|
| 124 |
+
"""
|
| 125 |
+
self.check_health()
|
| 126 |
+
self._ensure_queue_initialized()
|
| 127 |
+
assert self._idle_actors is not None
|
| 128 |
+
|
| 129 |
+
if self._idle_actors.empty():
|
| 130 |
+
raise RuntimeError("No idle actors available.")
|
| 131 |
+
actor = self._idle_actors.get_nowait()
|
| 132 |
+
actor_is_alive = True
|
| 133 |
+
original_actor = actor
|
| 134 |
+
try:
|
| 135 |
+
ret = ray.get(
|
| 136 |
+
actor.encode.remote(request_id=request_id,
|
| 137 |
+
prompt=prompt,
|
| 138 |
+
lora_request=lora_request,
|
| 139 |
+
add_special_tokens=add_special_tokens))
|
| 140 |
+
except ActorDiedError as e:
|
| 141 |
+
# If the actor is dead, we first try to reinitialize it.
|
| 142 |
+
logger.warning("%s died with ActorDiedError, reinitializing.",
|
| 143 |
+
actor,
|
| 144 |
+
exc_info=e)
|
| 145 |
+
actor = self._init_actor()
|
| 146 |
+
try:
|
| 147 |
+
ret = ray.get(
|
| 148 |
+
actor.encode.remote(request_id=request_id,
|
| 149 |
+
prompt=prompt,
|
| 150 |
+
lora_request=lora_request,
|
| 151 |
+
add_special_tokens=add_special_tokens))
|
| 152 |
+
except ActorDiedError as e:
|
| 153 |
+
logger.error(
|
| 154 |
+
"%s died for second time in a row, marking "
|
| 155 |
+
"RayTokenizerGroupPool as unhealthy.", actor)
|
| 156 |
+
actor_is_alive = False
|
| 157 |
+
if not self._exception:
|
| 158 |
+
self._exception = e
|
| 159 |
+
self.check_health()
|
| 160 |
+
finally:
|
| 161 |
+
self._finalize_encode(actor, original_actor, actor_is_alive)
|
| 162 |
+
return ret
|
| 163 |
+
|
| 164 |
+
async def encode_async(
|
| 165 |
+
self,
|
| 166 |
+
prompt: str,
|
| 167 |
+
request_id: Optional[str] = None,
|
| 168 |
+
lora_request: Optional[LoRARequest] = None,
|
| 169 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 170 |
+
"""Encode a prompt using the tokenizer group.
|
| 171 |
+
|
| 172 |
+
We pick an idle actor and use it to encode the prompt.
|
| 173 |
+
If there are no idle actors, we wait until one becomes
|
| 174 |
+
available.
|
| 175 |
+
The actor is then put back in the queue for future use.
|
| 176 |
+
This is non-blocking.
|
| 177 |
+
"""
|
| 178 |
+
self.check_health()
|
| 179 |
+
self._ensure_queue_initialized()
|
| 180 |
+
assert self._idle_actors is not None
|
| 181 |
+
|
| 182 |
+
actor = await self._idle_actors.get()
|
| 183 |
+
actor_is_alive = True
|
| 184 |
+
original_actor = actor
|
| 185 |
+
try:
|
| 186 |
+
ret = await actor.encode.remote(
|
| 187 |
+
request_id=request_id,
|
| 188 |
+
prompt=prompt,
|
| 189 |
+
lora_request=lora_request,
|
| 190 |
+
add_special_tokens=add_special_tokens)
|
| 191 |
+
except ActorDiedError as e:
|
| 192 |
+
# If the actor is dead, we first try to reinitialize it.
|
| 193 |
+
logger.warning("%s died with ActorDiedError, reinitializing.",
|
| 194 |
+
actor,
|
| 195 |
+
exc_info=e)
|
| 196 |
+
actor = self._init_actor()
|
| 197 |
+
try:
|
| 198 |
+
ret = await actor.encode.remote(
|
| 199 |
+
request_id=request_id,
|
| 200 |
+
prompt=prompt,
|
| 201 |
+
lora_request=lora_request,
|
| 202 |
+
add_special_tokens=add_special_tokens)
|
| 203 |
+
except ActorDiedError as e:
|
| 204 |
+
logger.error(
|
| 205 |
+
"%s died for second time in a row, marking "
|
| 206 |
+
"RayTokenizerGroupPool as unhealthy.", actor)
|
| 207 |
+
actor_is_alive = False
|
| 208 |
+
if not self._exception:
|
| 209 |
+
self._exception = e
|
| 210 |
+
self.check_health()
|
| 211 |
+
finally:
|
| 212 |
+
self._finalize_encode(actor, original_actor, actor_is_alive)
|
| 213 |
+
return ret
|
| 214 |
+
|
| 215 |
+
def get_max_input_len(self,
|
| 216 |
+
lora_request: Optional[LoRARequest] = None
|
| 217 |
+
) -> Optional[int]:
|
| 218 |
+
"""Get the maximum input length for the LoRA request."""
|
| 219 |
+
return self._local_tokenizer_group.get_max_input_len(lora_request)
|
| 220 |
+
|
| 221 |
+
def get_lora_tokenizer(
|
| 222 |
+
self,
|
| 223 |
+
lora_request: Optional[LoRARequest] = None,
|
| 224 |
+
) -> AnyTokenizer:
|
| 225 |
+
return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
|
| 226 |
+
|
| 227 |
+
async def get_lora_tokenizer_async(
|
| 228 |
+
self,
|
| 229 |
+
lora_request: Optional[LoRARequest] = None,
|
| 230 |
+
) -> AnyTokenizer:
|
| 231 |
+
return await self._local_tokenizer_group.get_lora_tokenizer_async(
|
| 232 |
+
lora_request)
|
| 233 |
+
|
| 234 |
+
def check_health(self):
|
| 235 |
+
if self._exception:
|
| 236 |
+
raise RuntimeError(
|
| 237 |
+
"TokenizerGroupPool is unhealthy.") from self._exception
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
|
| 241 |
+
"""Copy over all current process environment variables to the runtime_env.
|
| 242 |
+
|
| 243 |
+
The variables in runtime_env will take precedence over the current process
|
| 244 |
+
environment variables.
|
| 245 |
+
|
| 246 |
+
runtime_env will be modified in place."""
|
| 247 |
+
env_vars = os.environ.copy()
|
| 248 |
+
runtime_env.setdefault("env_vars", {})
|
| 249 |
+
env_vars.update(runtime_env["env_vars"])
|
| 250 |
+
runtime_env["env_vars"] = env_vars
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from vllm.config import TokenizerPoolConfig
|
| 6 |
+
from vllm.lora.request import LoRARequest
|
| 7 |
+
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
|
| 8 |
+
get_lora_tokenizer,
|
| 9 |
+
get_lora_tokenizer_async,
|
| 10 |
+
get_tokenizer)
|
| 11 |
+
from vllm.utils import LRUCache
|
| 12 |
+
|
| 13 |
+
from .base_tokenizer_group import BaseTokenizerGroup
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TokenizerGroup(BaseTokenizerGroup):
|
| 17 |
+
"""A group of tokenizers that can be used for LoRA adapters."""
|
| 18 |
+
|
| 19 |
+
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
| 20 |
+
max_input_length: Optional[int], **tokenizer_config):
|
| 21 |
+
self.tokenizer_id = tokenizer_id
|
| 22 |
+
self.tokenizer_config = tokenizer_config
|
| 23 |
+
self.enable_lora = enable_lora
|
| 24 |
+
self.max_input_length = max_input_length
|
| 25 |
+
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
|
| 26 |
+
max_loras = tokenizer_config.get("max_loras", 0)
|
| 27 |
+
self.lora_tokenizers = LRUCache[int, AnyTokenizer](
|
| 28 |
+
capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
|
| 29 |
+
|
| 30 |
+
@classmethod
|
| 31 |
+
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
| 32 |
+
**init_kwargs) -> "TokenizerGroup":
|
| 33 |
+
return cls(**init_kwargs)
|
| 34 |
+
|
| 35 |
+
def ping(self) -> bool:
|
| 36 |
+
"""Check if the tokenizer group is alive."""
|
| 37 |
+
return True
|
| 38 |
+
|
| 39 |
+
def get_max_input_len(self,
|
| 40 |
+
lora_request: Optional[LoRARequest] = None
|
| 41 |
+
) -> Optional[int]:
|
| 42 |
+
"""Get the maximum input length for the LoRA request."""
|
| 43 |
+
return self.max_input_length
|
| 44 |
+
|
| 45 |
+
def _raise_if_input_too_long(self,
|
| 46 |
+
encoded_tokens: List[int],
|
| 47 |
+
lora_request: Optional[LoRARequest] = None):
|
| 48 |
+
input_length = len(encoded_tokens)
|
| 49 |
+
if lora_request:
|
| 50 |
+
max_input_length = (lora_request.long_lora_max_len
|
| 51 |
+
or self.max_input_length)
|
| 52 |
+
else:
|
| 53 |
+
max_input_length = self.max_input_length
|
| 54 |
+
if max_input_length is not None and input_length > max_input_length:
|
| 55 |
+
raise ValueError("Input too long.", input_length, max_input_length)
|
| 56 |
+
|
| 57 |
+
def encode(self,
|
| 58 |
+
prompt: str,
|
| 59 |
+
request_id: Optional[str] = None,
|
| 60 |
+
lora_request: Optional[LoRARequest] = None,
|
| 61 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 62 |
+
tokenizer = self.get_lora_tokenizer(lora_request)
|
| 63 |
+
ret = encode_tokens(tokenizer,
|
| 64 |
+
prompt,
|
| 65 |
+
add_special_tokens=add_special_tokens)
|
| 66 |
+
self._raise_if_input_too_long(ret, lora_request)
|
| 67 |
+
return ret
|
| 68 |
+
|
| 69 |
+
async def encode_async(
|
| 70 |
+
self,
|
| 71 |
+
prompt: str,
|
| 72 |
+
request_id: Optional[str] = None,
|
| 73 |
+
lora_request: Optional[LoRARequest] = None,
|
| 74 |
+
add_special_tokens: Optional[bool] = None) -> List[int]:
|
| 75 |
+
tokenizer = await self.get_lora_tokenizer_async(lora_request)
|
| 76 |
+
ret = encode_tokens(tokenizer,
|
| 77 |
+
prompt,
|
| 78 |
+
add_special_tokens=add_special_tokens)
|
| 79 |
+
self._raise_if_input_too_long(ret, lora_request)
|
| 80 |
+
return ret
|
| 81 |
+
|
| 82 |
+
def get_lora_tokenizer(
|
| 83 |
+
self,
|
| 84 |
+
lora_request: Optional[LoRARequest] = None,
|
| 85 |
+
) -> AnyTokenizer:
|
| 86 |
+
if not lora_request or not self.enable_lora:
|
| 87 |
+
return self.tokenizer
|
| 88 |
+
if lora_request.lora_int_id not in self.lora_tokenizers:
|
| 89 |
+
tokenizer = (get_lora_tokenizer(
|
| 90 |
+
lora_request, **self.tokenizer_config) or self.tokenizer)
|
| 91 |
+
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
| 92 |
+
return tokenizer
|
| 93 |
+
else:
|
| 94 |
+
return self.lora_tokenizers[lora_request.lora_int_id]
|
| 95 |
+
|
| 96 |
+
async def get_lora_tokenizer_async(
|
| 97 |
+
self,
|
| 98 |
+
lora_request: Optional[LoRARequest] = None,
|
| 99 |
+
) -> AnyTokenizer:
|
| 100 |
+
if not lora_request or not self.enable_lora:
|
| 101 |
+
return self.tokenizer
|
| 102 |
+
if lora_request.lora_int_id not in self.lora_tokenizers:
|
| 103 |
+
tokenizer = (await get_lora_tokenizer_async(
|
| 104 |
+
lora_request, **self.tokenizer_config) or self.tokenizer)
|
| 105 |
+
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
| 106 |
+
return tokenizer
|
| 107 |
+
else:
|
| 108 |
+
return self.lora_tokenizers[lora_request.lora_int_id]
|
.venv/lib/python3.11/site-packages/vllm/transformers_utils/utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from os import PathLike
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Union
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def is_s3(model_or_path: str) -> bool:
|
| 9 |
+
return model_or_path.lower().startswith('s3://')
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def check_gguf_file(model: Union[str, PathLike]) -> bool:
|
| 13 |
+
"""Check if the file is a GGUF model."""
|
| 14 |
+
model = Path(model)
|
| 15 |
+
if not model.is_file():
|
| 16 |
+
return False
|
| 17 |
+
elif model.suffix == ".gguf":
|
| 18 |
+
return True
|
| 19 |
+
|
| 20 |
+
with open(model, "rb") as f:
|
| 21 |
+
header = f.read(4)
|
| 22 |
+
return header == b"GGUF"
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__init__.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import enum
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import TYPE_CHECKING, List, Optional, Union
|
| 6 |
+
|
| 7 |
+
import msgspec
|
| 8 |
+
|
| 9 |
+
from vllm.v1.metrics.stats import SchedulerStats
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from vllm.lora.request import LoRARequest
|
| 13 |
+
from vllm.multimodal import MultiModalKwargs
|
| 14 |
+
from vllm.multimodal.inputs import PlaceholderRange
|
| 15 |
+
from vllm.sampling_params import SamplingParams
|
| 16 |
+
|
| 17 |
+
# These are possible values of RequestOutput.finish_reason,
|
| 18 |
+
# so form part of the external API.
|
| 19 |
+
FINISH_REASON_STRINGS = ("stop", "length", "abort")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FinishReason(enum.IntEnum):
|
| 23 |
+
"""
|
| 24 |
+
Reason a request finished - stop, length, or abort.
|
| 25 |
+
|
| 26 |
+
Int rather than Str for more compact serialization.
|
| 27 |
+
|
| 28 |
+
stop - a stop string was emitted
|
| 29 |
+
length - max_tokens was consumed, or max_model_len was reached
|
| 30 |
+
abort - aborted for another reason
|
| 31 |
+
|
| 32 |
+
"""
|
| 33 |
+
STOP = 0
|
| 34 |
+
LENGTH = 1
|
| 35 |
+
ABORT = 2
|
| 36 |
+
|
| 37 |
+
def __str__(self):
|
| 38 |
+
return FINISH_REASON_STRINGS[self.value]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass
|
| 42 |
+
class EngineCoreRequest:
|
| 43 |
+
|
| 44 |
+
# NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
|
| 45 |
+
# but this object is currently not playing well with msgspec
|
| 46 |
+
# due to circular imports and typing we have in data.py
|
| 47 |
+
|
| 48 |
+
request_id: str
|
| 49 |
+
# NOTE(ywang96): original text prompt is needed when a request is added to
|
| 50 |
+
# Detokenizer, but set to None when it is added to EngineCoreClient.
|
| 51 |
+
prompt: Optional[str]
|
| 52 |
+
prompt_token_ids: List[int]
|
| 53 |
+
mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
|
| 54 |
+
mm_hashes: Optional[List[str]]
|
| 55 |
+
mm_placeholders: Optional[List["PlaceholderRange"]]
|
| 56 |
+
sampling_params: "SamplingParams"
|
| 57 |
+
eos_token_id: Optional[int]
|
| 58 |
+
arrival_time: float
|
| 59 |
+
lora_request: Optional["LoRARequest"]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class EngineCoreOutput(
|
| 63 |
+
msgspec.Struct,
|
| 64 |
+
array_like=True, # type: ignore[call-arg]
|
| 65 |
+
omit_defaults=True, # type: ignore[call-arg]
|
| 66 |
+
gc=False): # type: ignore[call-arg]
|
| 67 |
+
|
| 68 |
+
request_id: str
|
| 69 |
+
new_token_ids: List[int]
|
| 70 |
+
finished: bool
|
| 71 |
+
finish_reason: Optional[FinishReason] = None
|
| 72 |
+
stop_reason: Union[int, str, None] = None
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class EngineCoreOutputs(
|
| 76 |
+
msgspec.Struct,
|
| 77 |
+
array_like=True, # type: ignore[call-arg]
|
| 78 |
+
omit_defaults=True, # type: ignore[call-arg]
|
| 79 |
+
gc=False): # type: ignore[call-arg]
|
| 80 |
+
|
| 81 |
+
#NOTE(Nick): We could consider ways to make this more compact,
|
| 82 |
+
# e.g. columnwise layout
|
| 83 |
+
|
| 84 |
+
# [num_reqs]
|
| 85 |
+
outputs: List[EngineCoreOutput]
|
| 86 |
+
scheduler_stats: SchedulerStats
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@dataclass
|
| 90 |
+
class EngineCoreProfile:
|
| 91 |
+
is_start: bool
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@dataclass
|
| 95 |
+
class EngineCoreResetPrefixCache:
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class EngineCoreRequestType(enum.Enum):
|
| 100 |
+
"""
|
| 101 |
+
Request types defined as hex byte strings, so it can be sent over sockets
|
| 102 |
+
without separate encoding step.
|
| 103 |
+
"""
|
| 104 |
+
ADD = b'\x00'
|
| 105 |
+
ABORT = b'\x01'
|
| 106 |
+
PROFILE = b'\x02'
|
| 107 |
+
RESET_PREFIX_CACHE = b'\x03'
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile,
|
| 111 |
+
EngineCoreResetPrefixCache, List[str]]
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (4.57 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/async_llm.cpython-311.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core.cpython-311.pyc
ADDED
|
Binary file (14.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc
ADDED
|
Binary file (17.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc
ADDED
|
Binary file (6.93 kB). View file
|
|
|