koichi12 commited on
Commit
f0ca319
·
verified ·
1 Parent(s): b0fb87d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__init__.py +19 -0
  2. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/config.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py +605 -0
  11. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/arctic.py +206 -0
  12. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/chatglm.py +71 -0
  13. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/cohere2.py +194 -0
  14. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  15. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/eagle.py +51 -0
  16. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/exaone.py +191 -0
  17. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/falcon.py +89 -0
  18. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/h2ovl.py +15 -0
  19. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/internvl.py +53 -0
  20. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/medusa.py +62 -0
  21. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mllama.py +30 -0
  22. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mpt.py +179 -0
  23. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nemotron.py +204 -0
  24. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nvlm_d.py +14 -0
  25. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/solar.py +246 -0
  26. .venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/ultravox.py +101 -0
  27. .venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer.py +167 -0
  28. .venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer_utils.py +169 -0
  29. .venv/lib/python3.11/site-packages/vllm/transformers_utils/processor.py +106 -0
  30. .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__init__.py +6 -0
  31. .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/__init__.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  34. .venv/lib/python3.11/site-packages/vllm/transformers_utils/s3_utils.py +154 -0
  35. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer.py +245 -0
  36. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__init__.py +56 -0
  37. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +70 -0
  42. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +250 -0
  43. .venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +108 -0
  44. .venv/lib/python3.11/site-packages/vllm/transformers_utils/utils.py +22 -0
  45. .venv/lib/python3.11/site-packages/vllm/v1/engine/__init__.py +111 -0
  46. .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/__init__.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/async_llm.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SPDX-License-Identifier: Apache-2.0
"""Import-time setup for ``vllm.transformers_utils``.

If ``VLLM_USE_MODELSCOPE`` is set, the Hugging Face hub is patched at
import time so that model downloads are served by ModelScope instead.
"""

from vllm.envs import VLLM_USE_MODELSCOPE

if VLLM_USE_MODELSCOPE:
    # Patch here, before each import happens
    import modelscope
    from packaging import version

    # patch_hub begins from modelscope>=1.18.1
    if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
        raise ImportError(
            'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
            'install by `pip install modelscope -U`')

    from modelscope.utils.hf_util import patch_hub

    # Patch hub to download models from modelscope to speed up.
    patch_hub()
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (793 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/config.cpython-311.pyc ADDED
Binary file (25 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer.cpython-311.pyc ADDED
Binary file (6.19 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/detokenizer_utils.cpython-311.pyc ADDED
Binary file (5.98 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/processor.cpython-311.pyc ADDED
Binary file (3.33 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/s3_utils.cpython-311.pyc ADDED
Binary file (8.73 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/tokenizer.cpython-311.pyc ADDED
Binary file (11 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.37 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/config.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Type, Union
8
+
9
+ import huggingface_hub
10
+ from huggingface_hub import (file_exists, hf_hub_download,
11
+ try_to_load_from_cache)
12
+ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
13
+ LocalEntryNotFoundError,
14
+ RepositoryNotFoundError,
15
+ RevisionNotFoundError)
16
+ from torch import nn
17
+ from transformers import GenerationConfig, PretrainedConfig
18
+ from transformers.models.auto.image_processing_auto import (
19
+ get_image_processor_config)
20
+ from transformers.models.auto.modeling_auto import (
21
+ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
22
+ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
23
+
24
+ from vllm.envs import VLLM_USE_MODELSCOPE
25
+ from vllm.logger import init_logger
26
+ # yapf conflicts with isort for this block
27
+ # yapf: disable
28
+ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
29
+ DbrxConfig, DeepseekVLV2Config,
30
+ EAGLEConfig, ExaoneConfig,
31
+ H2OVLChatConfig,
32
+ InternVLChatConfig, JAISConfig,
33
+ MedusaConfig, MllamaConfig,
34
+ MLPSpeculatorConfig, MPTConfig,
35
+ NemotronConfig, NVLM_D_Config,
36
+ Olmo2Config, RWConfig,
37
+ SolarConfig, Telechat2Config,
38
+ UltravoxConfig)
39
+ # yapf: enable
40
+ from vllm.transformers_utils.utils import check_gguf_file
41
+ from vllm.utils import resolve_obj_by_qualname
42
+
43
+ if VLLM_USE_MODELSCOPE:
44
+ from modelscope import AutoConfig
45
+ else:
46
+ from transformers import AutoConfig
47
+
48
+ MISTRAL_CONFIG_NAME = "params.json"
49
+ HF_TOKEN = os.getenv('HF_TOKEN', None)
50
+
51
+ logger = init_logger(__name__)
52
+
53
+ _CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
54
+ "mllama": MllamaConfig
55
+ }
56
+
57
+ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
58
+ "chatglm": ChatGLMConfig,
59
+ "cohere2": Cohere2Config,
60
+ "dbrx": DbrxConfig,
61
+ "deepseek_vl_v2": DeepseekVLV2Config,
62
+ "mpt": MPTConfig,
63
+ "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
64
+ "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
65
+ "jais": JAISConfig,
66
+ "mlp_speculator": MLPSpeculatorConfig,
67
+ "medusa": MedusaConfig,
68
+ "eagle": EAGLEConfig,
69
+ "exaone": ExaoneConfig,
70
+ "h2ovl_chat": H2OVLChatConfig,
71
+ "internvl_chat": InternVLChatConfig,
72
+ "nemotron": NemotronConfig,
73
+ "NVLM_D": NVLM_D_Config,
74
+ "olmo2": Olmo2Config,
75
+ "solar": SolarConfig,
76
+ "telechat": Telechat2Config,
77
+ "ultravox": UltravoxConfig,
78
+ **_CONFIG_REGISTRY_OVERRIDE_HF
79
+ }
80
+
81
+
82
class ConfigFormat(str, enum.Enum):
    """On-disk serialization format of a model's configuration."""
    AUTO = "auto"  # detect: prefer HF config.json, else Mistral params.json
    HF = "hf"  # Hugging Face ``config.json``
    MISTRAL = "mistral"  # Mistral-native ``params.json``
+
87
+
88
def file_or_path_exists(model: Union[str, Path], config_name: str,
                        revision: Optional[str]) -> bool:
    """Check whether ``config_name`` exists for ``model`` — on local disk,
    in the Hugging Face cache, or on the Hub."""
    local_dir = Path(model)
    if local_dir.exists():
        # Local checkout: just look for the file on disk.
        return (local_dir / config_name).is_file()

    # Offline mode support: a cached copy counts as existing.
    cached = try_to_load_from_cache(repo_id=model,
                                    filename=config_name,
                                    revision=revision)
    if isinstance(cached, str):
        return True

    # NB: file_exists only checks the Hub, which fails in offline mode;
    # in that case all we know is that the file is not cached locally.
    try:
        return file_exists(model,
                           config_name,
                           revision=revision,
                           token=HF_TOKEN)
    except huggingface_hub.errors.OfflineModeIsEnabled:
        return False
+
113
+
114
def patch_rope_scaling(config: "PretrainedConfig") -> None:
    """Provide backwards compatibility for RoPE."""
    # Multimodal configs nest the LLM settings under `text_config`;
    # normalize that sub-config first.
    sub_config = getattr(config, "text_config", None)
    if sub_config is not None:
        patch_rope_scaling(sub_config)

    if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
        patch_rope_scaling_dict(rope_scaling)
123
+
124
+
125
def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None:
    """Normalize legacy rope_scaling keys and values in place."""
    has_modern = "rope_type" in rope_scaling
    has_legacy = "type" in rope_scaling

    if has_modern and has_legacy:
        rope_type = rope_scaling["rope_type"]
        rope_type_legacy = rope_scaling["type"]
        if rope_type != rope_type_legacy:
            raise ValueError(
                f"Found conflicts between 'rope_type={rope_type}' (modern "
                f"field) and 'type={rope_type_legacy}' (legacy field). "
                "You should only specify one of them.")
    elif has_legacy:
        # Only the legacy key is present: migrate it to the modern key.
        rope_scaling["rope_type"] = rope_scaling["type"]
        logger.info("Replacing legacy 'type' key with 'rope_type'")
    elif not has_modern:
        raise ValueError("rope_scaling should have a 'rope_type' key")

    # Rewrite legacy rope_type values to their modern equivalents.
    if rope_scaling["rope_type"] == "su":
        rope_scaling["rope_type"] = "longrope"
        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
    elif rope_scaling["rope_type"] == "mrope":
        assert "mrope_section" in rope_scaling
        rope_scaling["rope_type"] = "default"
        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
149
+
150
+
151
def uses_mrope(config: "PretrainedConfig") -> bool:
    """Detect if the model with this config uses M-ROPE."""
    # M-RoPE is signalled by a "mrope_section" entry in rope_scaling.
    scaling = getattr(config, "rope_scaling", None)
    return scaling is not None and "mrope_section" in scaling
158
+
159
+
160
def is_encoder_decoder(config: "PretrainedConfig") -> bool:
    """Detect if the model with this config is used as an encoder/decoder."""
    # Descend through nested text_config sub-configs, then read the flag
    # from the innermost one.
    while (inner := getattr(config, "text_config", None)) is not None:
        config = inner
    return getattr(config, "is_encoder_decoder", False)
167
+
168
+
169
def get_config(
    model: Union[str, Path],
    trust_remote_code: bool,
    revision: Optional[str] = None,
    code_revision: Optional[str] = None,
    config_format: ConfigFormat = ConfigFormat.AUTO,
    **kwargs,
) -> PretrainedConfig:
    """Load a model configuration in HF or Mistral format.

    Args:
        model: Model name, local directory, or (for GGUF) path to the file.
        trust_remote_code: Allow executing remote configuration code.
        revision: Model revision to load.
        code_revision: Revision of the remote code, when different.
        config_format: Force a format, or ``AUTO`` to detect from the
            files present.
        **kwargs: Forwarded to the underlying config loader.

    Raises:
        ValueError: No supported config format found, or an unknown
            ``config_format`` value was passed.
        RuntimeError: Remote code is required but not trusted, or a GGUF
            model type has no causal-LM architecture mapping.
    """
    # Separate model folder from file path for GGUF models

    is_gguf = check_gguf_file(model)
    if is_gguf:
        kwargs["gguf_file"] = Path(model).name
        model = Path(model).parent

    if config_format == ConfigFormat.AUTO:
        # Prefer HF config.json; fall back to Mistral params.json.
        if is_gguf or file_or_path_exists(
                model, HF_CONFIG_NAME, revision=revision):
            config_format = ConfigFormat.HF
        elif file_or_path_exists(model, MISTRAL_CONFIG_NAME,
                                 revision=revision):
            config_format = ConfigFormat.MISTRAL
        else:
            # If we're in offline mode and found no valid config format, then
            # raise an offline mode error to indicate to the user that they
            # don't have files cached and may need to go online.
            # This is conveniently triggered by calling file_exists().
            file_exists(model,
                        HF_CONFIG_NAME,
                        revision=revision,
                        token=HF_TOKEN)

            raise ValueError(f"No supported config format found in {model}")

    if config_format == ConfigFormat.HF:
        config_dict, _ = PretrainedConfig.get_config_dict(
            model,
            revision=revision,
            code_revision=code_revision,
            token=HF_TOKEN,
            **kwargs,
        )

        # Use custom model class if it's in our registry
        model_type = config_dict.get("model_type")
        if model_type in _CONFIG_REGISTRY:
            config_class = _CONFIG_REGISTRY[model_type]
            config = config_class.from_pretrained(
                model,
                revision=revision,
                code_revision=code_revision,
                token=HF_TOKEN,
                **kwargs,
            )
        else:
            try:
                config = AutoConfig.from_pretrained(
                    model,
                    trust_remote_code=trust_remote_code,
                    revision=revision,
                    code_revision=code_revision,
                    token=HF_TOKEN,
                    **kwargs,
                )
            except ValueError as e:
                # Surface a vLLM-specific hint when the failure is caused
                # by remote code that was not trusted.
                if (not trust_remote_code
                        and "requires you to execute the configuration file"
                        in str(e)):
                    err_msg = (
                        "Failed to load the model config. If the model "
                        "is a custom model not yet available in the "
                        "HuggingFace transformers library, consider setting "
                        "`trust_remote_code=True` in LLM or using the "
                        "`--trust-remote-code` flag in the CLI.")
                    raise RuntimeError(err_msg) from e
                else:
                    raise e

    elif config_format == ConfigFormat.MISTRAL:
        config = load_params_config(model, revision, token=HF_TOKEN, **kwargs)
    else:
        raise ValueError(f"Unsupported config format: {config_format}")

    # Special architecture mapping check for GGUF models
    if is_gguf:
        if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
            raise RuntimeError(
                f"Can't get gguf config for {config.model_type}.")
        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
        config.update({"architectures": [model_type]})

    patch_rope_scaling(config)

    if trust_remote_code:
        maybe_register_config_serialize_by_value()

    return config
266
+
267
+
268
def get_hf_file_to_dict(file_name: str,
                        model: Union[str, Path],
                        revision: Optional[str] = 'main'):
    """
    Downloads a file from the Hugging Face Hub and returns
    its contents as a dictionary.

    Parameters:
    - file_name (str): The name of the file to download.
    - model (str): The name of the model on the Hugging Face Hub.
    - revision (str): The specific version of the model.

    Returns:
    - config_dict (dict): A dictionary containing
    the contents of the downloaded file, or None if the file does not
    exist or could not be fetched.
    """
    file_path = Path(model) / file_name

    if file_or_path_exists(model=model,
                           config_name=file_name,
                           revision=revision):

        if not file_path.is_file():
            try:
                hf_hub_file = hf_hub_download(model,
                                              file_name,
                                              revision=revision)
            except (RepositoryNotFoundError, RevisionNotFoundError,
                    EntryNotFoundError, LocalEntryNotFoundError) as e:
                # BUGFIX: the exception was previously passed as a bare
                # %-style argument with no placeholder in the message,
                # which makes the logging module report a formatting error
                # instead of logging the event. Attach it as exc_info.
                logger.debug(
                    "File or repository not found in hf_hub_download",
                    exc_info=e)
                return None
            except HfHubHTTPError as e:
                logger.warning(
                    "Cannot connect to Hugging Face Hub. Skipping file "
                    "download for '%s':",
                    file_name,
                    exc_info=e)
                return None
            file_path = Path(hf_hub_file)

        with open(file_path) as file:
            return json.load(file)
    return None
312
+
313
+
314
def get_pooling_config(model: str, revision: Optional[str] = 'main'):
    """
    This function gets the pooling and normalize
    config from the model - only applies to
    sentence-transformers models.

    Args:
        model (str): The name of the Hugging Face model.
        revision (str, optional): The specific version
        of the model to use. Defaults to 'main'.

    Returns:
        dict: A dictionary containing the pooling
        type and whether normalization is used,
        or None if no pooling configuration is found.
    """

    modules_file_name = "modules.json"
    modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)

    if modules_dict is None:
        return None

    pooling = next((item for item in modules_dict
                    if item["type"] == "sentence_transformers.models.Pooling"),
                   None)
    normalize = bool(
        next((item for item in modules_dict
              if item["type"] == "sentence_transformers.models.Normalize"),
             False))

    if pooling:

        pooling_file_name = "{}/config.json".format(pooling["path"])
        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
        # BUGFIX: get_hf_file_to_dict returns None when the file is missing
        # or unreachable; iterating it would raise AttributeError.
        if pooling_dict is None:
            return None
        # The pooling mode is the first config flag that is set to True.
        pooling_type_name = next(
            (item for item, val in pooling_dict.items() if val is True), None)

        if pooling_type_name is not None:
            pooling_type_name = get_pooling_config_name(pooling_type_name)

        return {"pooling_type": pooling_type_name, "normalize": normalize}

    return None
357
+
358
+
359
def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
    """Normalize a sentence-transformers pooling flag to a vLLM pooling type.

    Args:
        pooling_name: A key from a sentence-transformers Pooling config,
            e.g. ``"pooling_mode_mean_tokens"``.

    Returns:
        The upper-case pooling type (e.g. ``"MEAN"``) if supported,
        otherwise ``None``.
    """
    # Strip the sentence-transformers prefix and any trailing qualifier,
    # e.g. "pooling_mode_mean_tokens" -> "mean".
    if "pooling_mode_" in pooling_name:
        pooling_name = pooling_name.replace("pooling_mode_", "")

    if "_" in pooling_name:
        pooling_name = pooling_name.split("_")[0]

    if "lasttoken" in pooling_name:
        pooling_name = "last"

    supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
    pooling_type_name = pooling_name.upper()

    # NOTE: the previous version wrapped this membership test in a
    # try/except NotImplementedError, but nothing in the test can raise
    # it — the handler (and its malformed logger call) was dead code.
    if pooling_type_name in supported_pooling_types:
        return pooling_type_name
    return None
379
+
380
+
381
def get_sentence_transformer_tokenizer_config(model: str,
                                              revision: Optional[str] = 'main'
                                              ):
    """
    Returns the tokenization configuration dictionary for a
    given Sentence Transformer BERT model.

    Parameters:
    - model (str): The name of the Sentence Transformer
    BERT model.
    - revision (str, optional): The revision of the
    model to use. Defaults to 'main'.

    Returns:
    - dict: A dictionary containing the configuration parameters
    for the Sentence Transformer BERT model.
    """
    candidate_names = (
        "sentence_bert_config.json",
        "sentence_roberta_config.json",
        "sentence_distilbert_config.json",
        "sentence_camembert_config.json",
        "sentence_albert_config.json",
        "sentence_xlm-roberta_config.json",
        "sentence_xlnet_config.json",
    )
    # Take the first candidate file that yields a non-empty dict.
    encoder_dict = next(
        (found for found in (get_hf_file_to_dict(name, model, revision)
                             for name in candidate_names) if found),
        None)

    if not encoder_dict:
        return None

    # Only return the config when both expected keys are present.
    required_keys = ("max_seq_length", "do_lower_case")
    if all(key in encoder_dict for key in required_keys):
        return encoder_dict
    return None
417
+
418
+
419
def maybe_register_config_serialize_by_value() -> None:
    """Try to register HF model configuration class to serialize by value

    If trust_remote_code is set, and the model's config file specifies an
    `AutoConfig` class, then the config class is typically an instance of
    a custom class imported from the HF modules cache.

    Examples:

    >>> from transformers import AutoConfig
    >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
    >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
    >>> import transformers_modules # error, not initialized
    >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
    >>> import transformers_modules # success, initialized
    >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config

    In the DeepSeek example, the config class is an instance of a custom
    class that is not serializable by default. This class will not be
    importable in spawned workers, and won't exist at all on
    other nodes, which breaks serialization of the config.

    In this function we tell the cloudpickle serialization library to pass
    instances of these generated classes by value instead of by reference,
    i.e. the class definition is serialized along with its data so that the
    class module does not need to be importable on the receiving end.

    See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
    """ # noqa
    try:
        import transformers_modules
    except ImportError:
        # the config does not need trust_remote_code
        return

    try:
        import cloudpickle
        cloudpickle.register_pickle_by_value(transformers_modules)

        # ray vendors its own version of cloudpickle
        from vllm.executor.ray_utils import ray
        if ray:
            ray.cloudpickle.register_pickle_by_value(transformers_modules)

        # multiprocessing uses pickle to serialize arguments when using spawn
        # Here we get pickle to use cloudpickle to serialize config objects
        # that contain instances of the custom config class to avoid
        # serialization problems if the generated module (and model) has a `.`
        # in its name
        import multiprocessing
        import pickle

        from vllm.config import VllmConfig

        def _reduce_config(config: VllmConfig):
            # Round-trip through cloudpickle so the by-value registration
            # above also covers the embedded custom config class.
            return (pickle.loads, (cloudpickle.dumps(config), ))

        multiprocessing.reducer.register(VllmConfig, _reduce_config)

    except Exception as e:
        # Best-effort: log and continue rather than failing model load.
        logger.warning(
            "Unable to register remote classes used by"
            " trust_remote_code with by-value serialization. This may"
            " lead to a later error. If remote code is not needed"
            " remove `--trust-remote-code`",
            exc_info=e)
485
+
486
+
487
def load_params_config(model: Union[str, Path], revision: Optional[str],
                       **kwargs) -> PretrainedConfig:
    # This function loads a params.json config which
    # should be used when loading models in mistral format

    config_file_name = "params.json"

    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
    assert isinstance(config_dict, dict)

    # Mistral field names -> HF PretrainedConfig field names.
    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        # Recursively rename keys and wrap every dict level (including
        # the top level) in a PretrainedConfig instance.
        if isinstance(elem, dict):
            config_dict = {}
            for key, value in elem.items():
                key = config_mapping.get(key, key)
                config_dict[key] = recurse_elems(value)
            return PretrainedConfig(**config_dict)
        else:
            return elem

    # Fill in HF-style fields that params.json does not carry explicitly.
    config_dict["model_type"] = config_dict.get("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
    config_dict["max_position_embeddings"] = config_dict.get(
        "max_position_embeddings", 128_000)

    # The presence of a `moe` section distinguishes Mixtral from Mistral.
    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    # A vision encoder means this is a Pixtral multimodal checkpoint:
    # re-nest the text settings under "text_config".
    if config_dict.get("vision_encoder") is not None:
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    config_dict.update(kwargs)

    config = recurse_elems(config_dict)
    return config
543
+
544
+
545
def get_hf_image_processor_config(
    model: Union[str, Path],
    revision: Optional[str] = None,
    **kwargs,
) -> Dict[str, Any]:
    """Fetch the image-processor config dict for ``model``; empty if
    running through ModelScope."""
    # ModelScope does not provide an interface for image_processor
    if VLLM_USE_MODELSCOPE:
        return {}

    # Separate model folder from file path for GGUF models
    if check_gguf_file(model):
        model = Path(model).parent

    return get_image_processor_config(model, revision=revision, **kwargs)
557
+
558
+
559
def get_hf_text_config(config: "PretrainedConfig"):
    """Get the "sub" config relevant to llm for multi modal models.
    No op for pure text models.
    """
    if not hasattr(config, "text_config"):
        return config

    text_config = config.text_config
    # The code operates under the assumption that text_config should have
    # `num_attention_heads` (among others). Assert here to fail early
    # if transformers config doesn't align with this assumption.
    assert hasattr(text_config, "num_attention_heads")
    return text_config
571
+
572
+
573
def try_get_generation_config(
    model: str,
    trust_remote_code: bool,
    revision: Optional[str] = None,
) -> Optional[GenerationConfig]:
    """Best-effort load of a model's generation config.

    Returns None when neither a generation_config.json nor a usable model
    config can be found.
    """
    try:
        return GenerationConfig.from_pretrained(model, revision=revision)
    except OSError:  # Not found
        pass

    # Fall back to deriving the generation config from the model config.
    try:
        hf_config = get_config(
            model,
            trust_remote_code=trust_remote_code,
            revision=revision,
        )
        return GenerationConfig.from_model_config(hf_config)
    except OSError:  # Not found
        return None
593
+
594
+
595
def get_cross_encoder_activation_function(config: "PretrainedConfig"):
    """Return the activation used on cross-encoder scores.

    Honors ``sbert_ce_default_activation_function`` when set; otherwise
    falls back to Sigmoid for single-label models and Identity otherwise.
    """
    function_name = getattr(config, "sbert_ce_default_activation_function",
                            None)
    if function_name is None:
        return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()

    # Only allow torch.nn.modules to be loaded by qualified name.
    assert function_name.startswith("torch.nn.modules."), \
        "Loading of activation functions is restricted to " \
        "torch.nn.modules for security reasons"
    return resolve_obj_by_qualname(function_name)()
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/arctic.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # yapf: disable
4
+ # ruff: noqa: E501
5
+ # coding=utf-8
6
+ # Copied from
7
+ # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
8
+ """ Arctic model configuration"""
9
+
10
+ from dataclasses import asdict, dataclass
11
+ from typing import Any, Dict
12
+
13
+ from transformers.configuration_utils import PretrainedConfig
14
+ from transformers.utils import logging
15
+
16
+ logger = logging.get_logger(__name__)
17
+
18
# Map of model name to the canonical config.json location on the Hub.
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
}
21
+
22
+
23
@dataclass
class ArcticLoraConfig:
    """LoRA settings for the Arctic model."""
    # LoRA rank (dimension of the low-rank update matrices).
    lora_r: int = 64
    # LoRA scaling factor (alpha).
    lora_alpha: float = 16
    # Whether the base weights are sharded; exact sharding semantics are
    # defined by the model implementation — confirm against usage.
    shard_base_weights: bool = False
28
+
29
+
30
@dataclass
class ArcticQuantizationConfig:
    """Weight-quantization settings for the Arctic model."""
    # Number of quantization bits.
    q_bits: int = 8
    # Rounding mode applied when quantizing.
    rounding: str = "nearest"
    # Bits allocated to the mantissa (presumably a float-style scheme) —
    # confirm against the quantizer implementation.
    mantissa_bits: int = 3
    # Number of weights sharing one set of quantization parameters.
    group_size: int = 128
36
+
37
+
38
class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..


    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to root per-token, can be also interpreted as the `top-p` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from the Arctic 7B style configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        # NOTE(review): the docstring above advertises 4096*32, but the code
        # default here is 4096 — confirm against the upstream checkpoint.
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        # NOTE(review): docstring says this defaults to 2; code default is 1.
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        # Core transformer geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # Mixture-of-experts routing / capacity settings.
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        # Checkpoints may serialize `quantization` as a plain dict; normalize
        # it into an ArcticQuantizationConfig so attribute access is uniform.
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        # Token ids and embedding tying are handled by PretrainedConfig.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
        """Build a config from a dict, re-coercing the nested quantization
        dict (as produced by `to_dict`) back into ArcticQuantizationConfig.
        """
        # PretrainedConfig.from_dict returns either the config or a
        # (config, unused_kwargs) tuple when `return_unused_kwargs=True`.
        result = super().from_dict(config_dict, **kwargs)
        config = result[0] if isinstance(result, tuple) else result
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, flattening the quantization dataclass
        so the result is JSON-serializable."""
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/chatglm.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from
4
+ # https://github.com/THUDM/ChatGLM2-6B
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class ChatGLMConfig(PretrainedConfig):
    """Configuration for ChatGLM-family models.

    Adapted from https://github.com/THUDM/ChatGLM2-6B. Parameter names follow
    the upstream checkpoint's ``config.json``; ``attribute_map`` exposes the
    HF-conventional aliases on top of them.
    """
    model_type = "chatglm"
    # Map HF-standard attribute names onto the ChatGLM-specific ones so
    # generic code can read e.g. `config.num_hidden_layers`.
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        # The checkpoint vocab is already padded; expose the same value under
        # both the upstream name and the HF-conventional `vocab_size`.
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        # Grouped/multi-query attention settings (group count is also exposed
        # as `n_head_kv` via attribute_map above).
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        # P-tuning v2 prefix settings (None/False when unused).
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/cohere2.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # ruff: noqa
4
+
5
+ # Adapted from
6
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
7
+ from transformers import PretrainedConfig
8
+ from transformers.modeling_rope_utils import rope_config_validation
9
+
10
+
11
class Cohere2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        sliding_window_pattern (`int`, *optional*, defaults to 4):
            Pattern for the sliding window attention.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere Nextmodel configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration)  # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config  # doctest: +SKIP
    ```
    """

    model_type = "cohere2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        sliding_window=4096,
        sliding_window_pattern=4,
        cache_implementation="hybrid",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.logit_scale = logit_scale
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Alternating local (sliding-window) / global attention layers.
        self.sliding_window = sliding_window
        self.sliding_window_pattern = sliding_window_pattern
        # Need to specify head_dim in the config so it can be used in the attention forward functions
        self.head_dim = hidden_size // num_attention_heads
        self.cache_implementation = cache_implementation

        # Validate the correctness of rotary position embeddings parameters
        # (runs before super().__init__, which is the order the upstream
        # transformers implementation uses).
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
192
+
193
+
194
# Explicit public API of this module.
__all__ = ["Cohere2Config"]
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/deepseek_vl2.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
4
+ from typing import Tuple
5
+
6
+ from transformers.configuration_utils import PretrainedConfig
7
+
8
+
9
class VisionEncoderConfig(PretrainedConfig):
    """Configuration for the DeepSeek-VL2 vision tower (a timm SigLIP ViT,
    per the default ``model_name``).

    Class-level values are the defaults; note that ``weight_init``,
    ``deterministic`` and ``num_recomputing_layers`` have class-level
    defaults only and are not settable through ``__init__`` (matching the
    upstream implementation) — they can still arrive via ``**kwargs``.
    """
    model_type: str = "vision"

    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384  # input resolution fed to the vision tower
    patch_size: int = 16
    width: int = 1024  # transformer embedding dimension
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True  # drop the classification head
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False  # activation checkpointing
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(self,
                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
                 image_size: int = 384,
                 patch_size: int = 16,
                 width: int = 1024,
                 layers: int = 24,
                 heads: int = 16,
                 mlp_ratio: int = 4,
                 global_pool: str = "map",
                 ignore_head: bool = True,
                 class_token: bool = False,
                 num_classes: int = 0,
                 use_checkpoint: bool = False,
                 **kwargs):
        self.model_name = model_name
        self.image_size = image_size
        self.patch_size = patch_size
        self.width = width
        self.layers = layers
        self.heads = heads
        self.mlp_ratio = mlp_ratio
        self.global_pool = global_pool
        self.ignore_head = ignore_head
        self.class_token = class_token
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint

        super().__init__(**kwargs)
56
+
57
+
58
class MlpProjectorConfig(PretrainedConfig):
    """Configuration for the MLP projector that maps vision-encoder features
    into the language model's embedding space.

    Class-level values are the defaults; ``token_pooling`` is class-level
    only and not settable through ``__init__`` (it can still arrive via
    ``**kwargs``).
    """
    model_type = "mlp_projector"
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    token_pooling: bool = False

    def __init__(self,
                 projector_type: str = "downsample_mlp_gelu",
                 input_dim: int = 1152,
                 n_embed: int = 2048,
                 depth: int = 2,
                 mlp_ratio: int = 1,
                 downsample_ratio: int = 2,
                 **kwargs):
        # Record the projector hyper-parameters, then defer the remaining
        # keyword arguments to PretrainedConfig.
        (self.projector_type,
         self.input_dim,
         self.n_embed,
         self.depth,
         self.mlp_ratio,
         self.downsample_ratio) = (projector_type, input_dim, n_embed,
                                   depth, mlp_ratio, downsample_ratio)

        super().__init__(**kwargs)
84
+
85
+
86
class DeepseekV2Config(PretrainedConfig):
    """Language-model configuration for DeepSeek-V2 as embedded inside
    DeepSeek-VL2 checkpoints (``language_config`` in config.json).

    Covers the dense transformer geometry, the MoE routing settings
    (``n_routed_experts`` / ``num_experts_per_tok`` / ``topk_method`` ...)
    and the Multi-head Latent Attention (MLA) head dimensions
    (``kv_lora_rank``, ``q_lora_rank``, ``qk_rope_head_dim``,
    ``qk_nope_head_dim``, ``v_head_dim``).
    """

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        # MLA low-rank projection / head dimensions.
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # NOTE(review): 'gready' (sic) mirrors the upstream DeepSeek config
        # literal; checkpoints serialize this exact string, so it must not
        # be "corrected" here.
        topk_method='gready',
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        # Whether to run attention in MLA mode (vLLM-specific toggle).
        use_mla=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # MoE expert layout and routing.
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # Coerce to float: config.json may carry the epsilon as a string
        # or int.
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
185
+
186
+
187
class DeepseekVLV2Config(PretrainedConfig):
    """Top-level DeepSeek-VL2 configuration.

    Aggregates the three nested sub-configs found in the checkpoint's
    ``config.json`` — the vision encoder (``vision_config``), the MLP
    projector (``projector_config``) and the DeepseekV2 language model
    (``language_config``, exposed as ``text_config``) — plus the
    image-tiling layout options.
    """
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: Tuple[Tuple[int, int], ...] = ((384, 384), )

    def __init__(self,
                 # Fix: the default used to be the literal placeholder string
                 # "tile_tag", contradicting the class-level default "2D";
                 # checkpoint configs supply this key explicitly, so the
                 # default only matters for bare construction.
                 tile_tag: str = "2D",
                 global_view_pos: str = "head",
                 candidate_resolutions: Tuple[Tuple[int,
                                                    int], ...] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        # The nested sub-configs arrive as plain dicts inside kwargs; coerce
        # each into its typed config class. super().__init__ above has
        # already copied the raw dicts onto self, so these assignments
        # overwrite them with proper config objects.
        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        # Mirror the LM vocab size at the top level for convenience.
        self.vocab_size = self.text_config.vocab_size
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/eagle.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import os
4
+ from typing import Optional, Union
5
+
6
+ from transformers import AutoConfig, PretrainedConfig
7
+
8
+
9
class EAGLEConfig(PretrainedConfig):
    """Configuration wrapper for EAGLE speculative-decoding draft models.

    Wraps the target model's config under ``self.model`` and mirrors its
    attributes onto this config so callers can read e.g.
    ``config.hidden_size`` directly.
    """
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        # `model` may arrive as an already-built config object or as a plain
        # dict parsed from config.json; build via AutoConfig in the latter
        # case.
        model_config = None if model is None else (AutoConfig.for_model(
            **model) if isinstance(model, dict) else model)

        # Forward matching overrides from kwargs onto the wrapped config,
        # but never clobber its identity fields (`architectures`,
        # `model_type`). When model_config is None, hasattr is always False
        # and nothing is set.
        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            # EAGLE supports a truncated draft-head vocabulary; default to
            # the wrapped model's full vocab size.
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        if "architectures" not in kwargs:
            kwargs["architectures"] = ["EAGLEModel"]

        super().__init__(**kwargs)

        # Mirror wrapped-model attributes that this config does not already
        # define (existing attributes win).
        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if not hasattr(self, k):
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        """Load from a local path or model id via the raw config dict."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/exaone.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Copied from
4
+ # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
5
+ # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """Exaone model configuration"""
19
+
20
+ from typing import Dict
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from transformers.utils import logging
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
# Empty archive map, kept for parity with the HF config-module convention.
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
28
+
29
+
30
class ExaoneConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:
    `~transformers.ExaoneModel`. It is used to instantiate an EXAONE model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Exaone.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from
    :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 102400):
            Vocabulary size of the model. Defines the number of different
            tokens that can be represented by the :obj:`inputs_ids` passed
            when calling :class:`~transformers.ExaoneModel`.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used
            with.
        hidden_size (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers (:obj:`int`, `optional`, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each
            group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details
            checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If
            it is not specified, will default to `num_attention_heads`.
        intermediate_size (:obj:`int`, `optional`):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer
            in the Transformer encoder. Defaults to ``4 * hidden_size`` when
            unspecified (or falsy).
        activation_function (:obj:`str` or :obj:`function`, `optional`,
            defaults to :obj:`"silu"`):
            The non-linear activation function (function or string) in the
            encoder and pooler.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            Percentage of hidden dimensions to allocate to rotary embeddings.
        resid_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for residual connections.
        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.

    Example::

        >>> from transformers import ExaoneModel, ExaoneConfig

        >>> # Initializing an EXAONE configuration
        >>> configuration = ExaoneConfig()

        >>> # Initializing a model from the configuration
        >>> model = ExaoneModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "exaone"
    keys_to_ignore_at_inference = ["past_key_values"]
    # PretrainedConfig aliases `config.num_hidden_layers` to `num_layers`.
    attribute_map = {"num_hidden_layers": "num_layers"}

    def __init__(
        self,
        vocab_size=102400,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        intermediate_size=None,
        activation_function="silu",
        rotary_pct=0.25,
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        # Set explicitly in addition to the attribute_map alias above.
        self.num_hidden_layers = num_layers
        if num_key_value_heads is None:
            # No GQA configured: fall back to standard multi-head attention.
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # NOTE: truthiness check — an explicit 0 also triggers the
        # 4 * hidden_size fallback.
        if intermediate_size:
            self.intermediate_size = intermediate_size
        else:
            self.intermediate_size = hidden_size * 4
        self.activation_function = activation_function
        self.resid_dropout = resid_dropout
        self.embed_dropout = embed_dropout
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rotary_pct = rotary_pct

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # EXAONE-specific architecture switches carried in checkpoint
        # configs; read with defaults so older checkpoints still load.
        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
        self.use_gated = kwargs.pop("use_gated", False)
        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
        self.rotary_type = kwargs.pop("rotary_type", None)
        self.scaling_factor = kwargs.pop("scaling_factor", 1)
        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
        # Defaults to True exactly when rotary_pct is 0.25.
        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                                 (rotary_pct == 0.25))
        if self.use_rotary_pos:
            # Rotary and absolute position embeddings are mutually exclusive.
            self.use_absolute_pos = False
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/falcon.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from
4
+ # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
5
+ # Copyright 2023 The vLLM team.
6
+ # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
7
+ # All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Falcon configuration"""
21
+ from transformers.configuration_utils import PretrainedConfig
22
+
23
+
24
class RWConfig(PretrainedConfig):
    """Configuration for Falcon checkpoints published under the legacy
    "RW" (RefinedWeb) config schema."""

    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Older checkpoints spell the hidden size "n_embed"; honor it when
        # present so those configs keep loading.
        legacy_n_embed = kwargs.pop("n_embed", None)
        if legacy_n_embed is not None:
            self.hidden_size = legacy_n_embed
        else:
            self.hidden_size = hidden_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = n_head_kv if n_head_kv is not None else 1
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack: falcon-40b checkpoints predate the explicit flag.
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        """Per-head projection width."""
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        """Rotary embeddings are used whenever ALiBi is disabled."""
        return not self.alibi
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/h2ovl.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from
4
+ # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
5
+ # --------------------------------------------------------
6
+ # H2OVL-Mississippi
7
+ # Copyright (c) 2024 H2O.AI
8
+ # Licensed under Apache 2.0 License [see LICENSE for details]
9
+ # --------------------------------------------------------
10
+
11
+ from .internvl import InternVLChatConfig
12
+
13
+
14
class H2OVLChatConfig(InternVLChatConfig):
    """Configuration for H2OVL-Mississippi chat models.

    Identical to the InternVL chat configuration apart from the registered
    ``model_type`` string.
    """

    model_type = "h2ovl_chat"
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/internvl.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from
4
+ # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
5
+ # --------------------------------------------------------
6
+ # InternVL
7
+ # Copyright (c) 2024 OpenGVLab
8
+ # Licensed under The MIT License [see LICENSE for details]
9
+ # --------------------------------------------------------
10
+ from transformers.configuration_utils import PretrainedConfig
11
+
12
+
13
class InternVLChatConfig(PretrainedConfig):
    """Composite configuration for InternVL chat models.

    Wraps a generic :class:`PretrainedConfig` for both the vision tower and
    the language model, alongside the image-tiling knobs used at inference.
    """

    model_type = 'internvl_chat'
    is_composition = True

    def __init__(self,
                 vision_config=None,
                 llm_config=None,
                 use_backbone_lora=0,
                 use_llm_lora=0,
                 select_layer=-1,
                 force_image_size=None,
                 downsample_ratio=0.5,
                 template=None,
                 dynamic_image_size=False,
                 use_thumbnail=False,
                 ps_version='v1',
                 min_dynamic_patch=1,
                 max_dynamic_patch=6,
                 **kwargs):
        super().__init__(**kwargs)

        # Missing sub-configs degrade to empty (all-default) configs.
        self.vision_config = PretrainedConfig(**(vision_config or {}))
        self.text_config = PretrainedConfig(**(llm_config or {}))

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/medusa.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import os
4
+ from typing import Optional, Union
5
+
6
+ from transformers import PretrainedConfig
7
+
8
+
9
class MedusaConfig(PretrainedConfig):
    """Configuration for Medusa speculative-decoding draft heads."""

    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        # A truncated LM-head vocabulary defaults to the full vocabulary.
        if truncated_vocab_size is None:
            self.truncated_vocab_size = vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size
        kwargs.setdefault("architectures", ["MedusaModel"])

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        # Normalize checkpoint-specific key spellings (e.g.
        # "medusa_num_heads") onto the canonical attribute names.
        for key in list(config_dict):
            if 'num' not in key:
                continue
            if 'heads' in key:
                config_dict["num_heads"] = config_dict.pop(key)
            elif 'layers' in key:
                config_dict["num_hidden_layers"] = config_dict.pop(key)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        # Medusa heads are MLP heads, not attention heads.
        return 0

    @property
    def num_lookahead_tokens(self):
        """One speculative token is proposed per Medusa head."""
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mllama.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from transformers.models.mllama import configuration_mllama as mllama_hf_config
4
+
5
+
6
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    """Text-model config that forces ``is_encoder_decoder=True``.

    transformers treats mllama as ``is_encoder_decoder=False``, but vLLM
    needs the flag set to enable its cross-attention path.
    """

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.is_encoder_decoder = True
19
+
20
+
21
class MllamaConfig(mllama_hf_config.MllamaConfig):
    """Mllama config whose text sub-config is vLLM's override class."""

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        # Promote a plain dict into the vLLM text-config override so the
        # encoder/decoder flag above takes effect.
        wrapped_text_config = (MllamaTextConfig(**text_config)
                               if isinstance(text_config, dict) else
                               text_config)
        super().__init__(text_config=wrapped_text_config, **kwargs)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/mpt.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Copied from
4
+ # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
5
+ """A HuggingFace-style model configuration."""
6
+ import warnings
7
+ from typing import Any, Dict, Optional, Union
8
+
9
+ from transformers import PretrainedConfig
10
+
11
# Default attention settings; per-checkpoint configs override these and
# missing keys are filled in by MPTConfig._set_config_defaults().
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}
# Default feed-forward block settings.
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
# Default weight-initialization settings.
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}
34
+
35
+
36
class MPTConfig(PretrainedConfig):
    """HuggingFace-style configuration for MPT (MosaicML) models.

    The sub-dict arguments (``attn_config``, ``ffn_config``,
    ``init_config``) are completed with the module-level defaults and then
    checked in :meth:`_validate_config`.
    """

    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Optional[Dict] = None,
                 ffn_config: Optional[Dict] = None,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Optional[Dict] = None,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        """Build the config; ``None`` sub-configs fall back to copies of
        the module-level default dicts.

        Previously the defaults were the module-level dicts themselves
        (mutable default arguments). Because `_set_config_defaults` fills
        keys in place and `_validate_config` ends by writing
        ``self.ffn_config['fc_type']`` / ``['bias']``, constructing one
        instance mutated the shared defaults for every later instance.
        Copying on fallback fixes that; explicitly passed dicts are kept
        as-is for backward compatibility.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = (dict(attn_config_defaults)
                            if attn_config is None else attn_config)
        self.ffn_config = (dict(ffn_config_defaults)
                           if ffn_config is None else ffn_config)
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = (dict(init_config_defaults)
                            if init_config is None else init_config)
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        # Strip training-only keys some checkpoints carry before they reach
        # PretrainedConfig.__init__.
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False):
            # ALiBi replaces learned positional embeddings.
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        """Fill missing keys of `config` in place from `config_defaults`."""
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self) -> None:
        """Complete sub-configs with defaults and reject invalid combos."""
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any(
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
             ]):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        # prefix-LM, ALiBi and sequence-id masking are not implemented for
        # the flash backend.
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            # TransformerEngine layers are optional; probe the import so we
            # can fail early with actionable install instructions.
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import fail. `fc_type: te` requires '
                    'TransformerEngine be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 is installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        # Propagate the linear-layer flavor into the FFN sub-config; with
        # the copy-on-default fix above this only touches this instance.
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nemotron.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Copyright 2024 HuggingFace Inc. team. All rights reserved.
4
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Nemotron model configuration"""
18
+
19
+ from transformers import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate an Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Legacy checkpoints may carry the head width as "kv_channels".
        # NOTE(review): `or` treats an explicit head_dim=0 as unset.
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility: older configs used "rope_percent" /
        # "rope_percentage". NOTE(review): the `or` chain treats an
        # explicit 0.0 as unset — presumably fine since a zero rotary
        # factor is not meaningful; confirm.
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is not a two-key dict with a
                `type` in {"linear", "dynamic"} and a float `factor` > 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(
                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/nvlm_d.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from
4
+ # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
5
+ # --------------------------------------------------------
6
+ # NVLM-D
7
+ # Copyright (c) 2024 NVIDIA
8
+ # Licensed under Apache 2.0 License [see LICENSE for details]
9
+ # --------------------------------------------------------
10
+ from .internvl import InternVLChatConfig
11
+
12
+
13
class NVLM_D_Config(InternVLChatConfig):
    """Configuration for NVIDIA's NVLM-D models.

    Shares the InternVL chat schema; only the registered ``model_type``
    differs.
    """

    model_type = 'NVLM_D'
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/solar.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """Solar model configuration"""
22
+
23
+ from transformers import PretrainedConfig
24
+ from transformers.utils import logging
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+ class SolarConfig(PretrainedConfig):
30
+ r"""
31
+ This is the configuration class to store
32
+ the configuration of a [`SolarModel`].
33
+ It is used to instantiate an LLaMA model
34
+ according to the specified arguments,
35
+ defining the model architecture.
36
+ Instantiating a configuration with the
37
+ defaults will yield a similar
38
+ configuration to that of the LLaMA-7B.
39
+ Configuration objects inherit from [`PretrainedConfig`]
40
+ and can be used to control the model outputs.
41
+ Read the documentation from [`PretrainedConfig`] for more information.
42
+ Args:
43
+ vocab_size (`int`, *optional*, defaults to 32000):
44
+ Vocabulary size of the LLaMA model.
45
+ Defines the number of different tokens
46
+ that can be represented by the `inputs_ids`
47
+ passed when calling [`SolarModel`]
48
+ hidden_size (`int`, *optional*, defaults to 4096):
49
+ Dimension of the hidden representations.
50
+ intermediate_size (`int`, *optional*, defaults to 11008):
51
+ Dimension of the MLP representations.
52
+ num_hidden_layers (`int`, *optional*, defaults to 32):
53
+ Number of hidden layers in the Transformer decoder.
54
+ num_attention_heads (`int`, *optional*, defaults to 32):
55
+ Number of attention heads for each attention layer
56
+ in the Transformer decoder.
57
+ num_key_value_heads (`int`, *optional*):
58
+ This is the number of key_value heads that
59
+ should be used to implement Grouped Query Attention. If
60
+ `num_key_value_heads=num_attention_heads`,
61
+ the model will use Multi Head Attention (MHA), if
62
+ `num_key_value_heads=1` the model
63
+ will use Multi Query Attention (MQA)
64
+ otherwise GQA is used. When
65
+ converting a multi-head checkpoint to a GQA checkpoint,
66
+ each group key and value head should be constructed
67
+ by meanpooling all the original heads within that group.
68
+ For more details checkout [this paper]
69
+ (https://arxiv.org/pdf/2305.13245.pdf).
70
+ If it is not specified, will default to
71
+ `num_attention_heads`.
72
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
73
+ The non-linear activation function (function or string)
74
+ in the decoder.
75
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
76
+ The maximum sequence length that this model might ever be used with.
77
+ Solar 1 supports up to 2048 tokens,
78
+ Solar 2 up to 4096, CodeSolar up to 16384.
79
+ initializer_range (`float`, *optional*, defaults to 0.02):
80
+ The standard deviation of
81
+ the truncated_normal_initializer for initializing
82
+ all weight matrices.
83
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
84
+ The epsilon used by the rms normalization layers.
85
+ use_cache (`bool`, *optional*, defaults to `True`):
86
+ Whether or not the model should return
87
+ the last key/values attentions (not used by all models). Only
88
+ relevant if `config.is_decoder=True`.
89
+ pad_token_id (`int`, *optional*):
90
+ Padding token id.
91
+ bos_token_id (`int`, *optional*, defaults to 1):
92
+ Beginning of stream token id.
93
+ eos_token_id (`int`, *optional*, defaults to 2):
94
+ End of stream token id.
95
+ pretraining_tp (`int`, *optional*, defaults to 1):
96
+ Experimental feature. Tensor parallelism rank
97
+ used during pretraining.
98
+ Please refer to [this
99
+ document](https://huggingface.co/docs/
100
+ transformers/main/
101
+ perf_train_gpu_many#tensor-parallelism)
102
+ to understand more about it. This value is
103
+ necessary to ensure exact reproducibility
104
+ of the pretraining results.
105
+ Please refer to [this
106
+ issue](https://github.com/pytorch/pytorch/issues/76232).
107
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
108
+ Whether to tie weight embeddings
109
+ rope_theta (`float`, *optional*, defaults to 10000.0):
110
+ The base period of the RoPE embeddings.
111
+ rope_scaling (`Dict`, *optional*):
112
+ Dictionary containing the scaling configuration for
113
+ the RoPE embeddings.
114
+ Currently supports two scaling
115
+ strategies: linear and dynamic.
116
+ Their scaling factor must be a float greater than 1.
117
+ The expected format is
118
+ `{"type": strategy name, "factor": scaling factor}`.
119
+ When using this flag, don't update
120
+ `max_position_embeddings` to the expected new maximum.
121
+ See the following thread for more information on how
122
+ these scaling strategies behave:
123
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
124
+ dynamically_scaled_rope_further_increases/. This is an
125
+ experimental feature, subject to breaking
126
+ API changes in future versions.
127
+ attention_bias (`bool`, *optional*, defaults to `False`):
128
+ Whether to use a bias in the query, key, value
129
+ and output projection layers during self-attention.
130
+ attention_dropout (`float`, *optional*, defaults to 0.0):
131
+ The dropout ratio for the attention probabilities.
132
+ mlp_bias (`bool`, *optional*, defaults to `False`):
133
+ Whether to use a bias in up_proj, down_proj and gate_proj
134
+ layers in the MLP layers.
135
+ sliding_window (`int`, *optional*, defaults to 2047):
136
+ Sliding window attention window size. If not specified,
137
+ will default to `2047`.
138
+ ```python
139
+ >>> from transformers import SolarModel, SolarConfig
140
+ >>> # Initializing a Solar-pro style configuration
141
+ >>> configuration = SolarConfig()
142
+ >>> # Initializing a model from the Solar-pro style configuration
143
+ >>> model = SolarModel(configuration)
144
+ >>> # Accessing the model configuration
145
+ >>> configuration = model.config
146
+ ```"""
147
+
148
+ model_type = "solar"
149
+ keys_to_ignore_at_inference = ["past_key_values"]
150
+
151
+ def __init__(
152
+ self,
153
+ vocab_size=32000,
154
+ hidden_size=4096,
155
+ intermediate_size=11008,
156
+ num_hidden_layers=32,
157
+ num_attention_heads=32,
158
+ num_key_value_heads=None,
159
+ hidden_act="silu",
160
+ max_position_embeddings=2048,
161
+ initializer_range=0.02,
162
+ rms_norm_eps=1e-6,
163
+ use_cache=True,
164
+ pad_token_id=None,
165
+ bos_token_id=1,
166
+ eos_token_id=2,
167
+ pretraining_tp=1,
168
+ tie_word_embeddings=False,
169
+ rope_theta=10000.0,
170
+ rope_scaling=None,
171
+ attention_bias=False,
172
+ attention_dropout=0.0,
173
+ mlp_bias=False,
174
+ sliding_window=2047,
175
+ bskcn_1=None,
176
+ bskcn_2=None,
177
+ bskcn_3=None,
178
+ bskcn_4=None,
179
+ bskcn_tv=None,
180
+ **kwargs,
181
+ ):
182
+ self.vocab_size = vocab_size
183
+ self.max_position_embeddings = max_position_embeddings
184
+ self.hidden_size = hidden_size
185
+ self.intermediate_size = intermediate_size
186
+ self.num_hidden_layers = num_hidden_layers
187
+ self.num_attention_heads = num_attention_heads
188
+
189
+ # for backward compatibility
190
+ if num_key_value_heads is None:
191
+ num_key_value_heads = num_attention_heads
192
+
193
+ self.num_key_value_heads = num_key_value_heads
194
+ self.hidden_act = hidden_act
195
+ self.initializer_range = initializer_range
196
+ self.rms_norm_eps = rms_norm_eps
197
+ self.pretraining_tp = pretraining_tp
198
+ self.use_cache = use_cache
199
+ self.rope_theta = rope_theta
200
+ self.rope_scaling = rope_scaling
201
+ self._rope_scaling_validation()
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ self.mlp_bias = mlp_bias
205
+ self.sliding_window = sliding_window
206
+ self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
207
+ self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
208
+ self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
209
+ self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
210
+ self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
211
+
212
+ super().__init__(
213
+ pad_token_id=pad_token_id,
214
+ bos_token_id=bos_token_id,
215
+ eos_token_id=eos_token_id,
216
+ tie_word_embeddings=tie_word_embeddings,
217
+ **kwargs,
218
+ )
219
+
220
+ def _rope_scaling_validation(self):
221
+ """
222
+ Validate the `rope_scaling` configuration.
223
+ """
224
+ if self.rope_scaling is None:
225
+ return
226
+
227
+ if (not isinstance(self.rope_scaling, dict)
228
+ or len(self.rope_scaling) != 2):
229
+ raise ValueError(
230
+ "`rope_scaling` must be a dictionary with two fields,"
231
+ " `type` and `factor`, "
232
+ f"got {self.rope_scaling}")
233
+ rope_scaling_type = self.rope_scaling.get("type", None)
234
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
235
+ if rope_scaling_type is None or rope_scaling_type not in [
236
+ "linear",
237
+ "dynamic",
238
+ ]:
239
+ raise ValueError(f"`rope_scaling`'s type field must be one of "
240
+ f"['linear', 'dynamic'], got {rope_scaling_type}")
241
+ if (rope_scaling_factor is None
242
+ or not isinstance(rope_scaling_factor, float)
243
+ or rope_scaling_factor <= 1.0):
244
+ raise ValueError(
245
+ f"`rope_scaling`'s factor field must be a float > 1,"
246
+ f" got {rope_scaling_factor}")
.venv/lib/python3.11/site-packages/vllm/transformers_utils/configs/ultravox.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
4
+ from typing import Any, Dict, Optional
5
+
6
+ import transformers
7
+
8
+
9
+ class UltravoxConfig(transformers.PretrainedConfig):
10
+ r"""
11
+ This is the configuration class to store the configuration of a
12
+ [`UltravoxForConditionalGeneration`]. It is used to instantiate an
13
+ Ultravox model according to the specified arguments, defining the model
14
+ architecture.
15
+
16
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to
17
+ control the model outputs. Read the documentation from [`PretrainedConfig`]
18
+ for more information.
19
+
20
+ Args:
21
+ audio_config (`Union[AutoConfig, dict]`, *optional*):
22
+ Custom audio config or dict
23
+ text_config (`Union[AutoConfig, dict]`, *optional*):
24
+ The config object of the text backbone. Can be any of `LlamaConfig`
25
+ or `MistralConfig`.
26
+ ignore_index (`int`, *optional*, defaults to -100):
27
+ The ignore index for the loss function.
28
+ audio_token_index (`int`, *optional*, defaults to 32000):
29
+ The audio token index to encode the audio prompt.
30
+ stack_factor (`int`, *optional*, defaults to 8):
31
+ Audio downsampling factor for the multimodal projector.
32
+ norm_init (`float`, *optional*, defaults to 0.4):
33
+ The initialization value for the layer normalization.
34
+ projector_act (`str`, *optional*, defaults to `"swiglu"`):
35
+ The activation function used by the multimodal projector.
36
+ text_model_lora_config (`LoraConfigSimplified`, *optional*):
37
+ The LoRA configuration for finetuning the text model.
38
+ audio_model_lora_config (`LoraConfigSimplified`, *optional*):
39
+ The LoRA configuration for finetuning the audio model.
40
+ """
41
+
42
+ model_type = "ultravox"
43
+ is_composition = False
44
+
45
+ def __init__(
46
+ self,
47
+ audio_config: Optional[Dict[str, Any]] = None,
48
+ text_config: Optional[Dict[str, Any]] = None,
49
+ audio_model_id: Optional[str] = None,
50
+ text_model_id: Optional[str] = None,
51
+ ignore_index: int = -100,
52
+ audio_token_index: int = 32000,
53
+ hidden_size: int = 4096,
54
+ stack_factor: int = 8,
55
+ norm_init: float = 0.4,
56
+ projector_act: str = "swiglu",
57
+ text_model_lora_config: Optional[Dict[str, Any]] = None,
58
+ audio_model_lora_config: Optional[Dict[str, Any]] = None,
59
+ **kwargs,
60
+ ):
61
+ self.ignore_index = ignore_index
62
+
63
+ self.audio_model_id = audio_model_id
64
+ self.text_model_id = text_model_id
65
+ self.audio_token_index = audio_token_index
66
+
67
+ self.hidden_size = hidden_size
68
+ self.stack_factor = stack_factor
69
+ self.norm_init = norm_init
70
+ self.projector_act = projector_act
71
+
72
+ if text_model_id is not None:
73
+ # Avoid circular import
74
+ from vllm.transformers_utils.config import get_config
75
+
76
+ self.text_config = get_config(text_model_id,
77
+ trust_remote_code=False)
78
+ else:
79
+ text_config = text_config or {}
80
+ self.text_config = transformers.CONFIG_MAPPING[text_config.get(
81
+ "model_type", "llama")](**text_config)
82
+
83
+ if audio_model_id is not None:
84
+ # Avoid circular import
85
+ from vllm.transformers_utils.config import get_config
86
+
87
+ self.audio_config = get_config(audio_model_id,
88
+ trust_remote_code=False)
89
+ else:
90
+ audio_config = audio_config or {}
91
+ self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
92
+ "model_type", "whisper")](**audio_config)
93
+
94
+ self.text_model_lora_config = text_model_lora_config or {}
95
+ self.audio_model_lora_config = audio_model_lora_config or {}
96
+
97
+ self.vocab_size = self.text_config.vocab_size
98
+
99
+ self.initializer_range = self.text_config.initializer_range
100
+
101
+ super().__init__(**kwargs)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Dict, List, Optional
4
+
5
+ from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
6
+ Sequence, SequenceGroup)
7
+
8
+ from .detokenizer_utils import (convert_prompt_ids_to_tokens,
9
+ detokenize_incrementally)
10
+ from .tokenizer import AnyTokenizer
11
+ from .tokenizer_group import BaseTokenizerGroup
12
+
13
+
14
+ class Detokenizer:
15
+ """Provides methods to decode the output of a model into text."""
16
+
17
+ def __init__(self, tokenizer_group: BaseTokenizerGroup):
18
+ self.tokenizer_group = tokenizer_group
19
+
20
+ def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
21
+ """Returns the HF tokenizer to use for a given sequence."""
22
+ return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
23
+
24
+ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
25
+ prompt_logprobs: List[Optional[Dict[
26
+ int, Logprob]]],
27
+ position_offset: int) -> None:
28
+ """Decodes the logprobs for the prompt of a sequence group.
29
+
30
+ Args:
31
+ seq_group: The sequence group to decode.
32
+ prompt_logprobs: The logprobs to decode.
33
+ position_offset: Offset of the first index of the logprobs
34
+ relative to the start of the sequence (for chunked prefill).
35
+
36
+ Returns:
37
+ The prompt logprobs with the decoded tokens.
38
+ """
39
+ prms = seq_group.sampling_params
40
+ assert prms is not None
41
+
42
+ # We can pick any sequence for the prompt.
43
+ seq = seq_group.get_seqs()[0]
44
+ # Only prompt, without the generated token.
45
+ all_token_ids = seq.get_token_ids()
46
+ prompt_token_ids = all_token_ids[:-1]
47
+ tokenizer = self.get_tokenizer_for_seq(seq)
48
+ prefix_offset = 0
49
+ read_offset = 0
50
+ next_iter_prefix_offset = 0
51
+ next_iter_read_offset = 0
52
+ next_iter_tokens: List[str] = []
53
+ prev_tokens = None
54
+
55
+ for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
56
+ prompt_logprobs):
57
+
58
+ # Absolute token position equals the index in the logprobs
59
+ # list plus the offset of the entire logprobs list relative
60
+ # to the start of the sequence.
61
+ token_position = token_position_in_logprob + position_offset
62
+ if not prompt_logprobs_for_token:
63
+ continue
64
+ for token_id, sample_logprob in prompt_logprobs_for_token.items():
65
+ if (sample_logprob.decoded_token is None
66
+ and token_id != VLLM_INVALID_TOKEN_ID):
67
+ prompt_token_ids_with_token = (
68
+ prompt_token_ids[:token_position] + [token_id])
69
+ (new_tokens, new_text, new_prefix_offset,
70
+ new_read_offset) = detokenize_incrementally(
71
+ tokenizer=tokenizer,
72
+ all_input_ids=prompt_token_ids_with_token,
73
+ prev_tokens=prev_tokens,
74
+ prefix_offset=prefix_offset,
75
+ read_offset=read_offset,
76
+ skip_special_tokens=prms.skip_special_tokens,
77
+ spaces_between_special_tokens=prms.
78
+ spaces_between_special_tokens,
79
+ )
80
+
81
+ sample_logprob.decoded_token = new_text
82
+
83
+ # Use the offsets & prev tokens corresponding to
84
+ # real tokens to ensure detokenization is consistent
85
+ # actual with prompt.
86
+ if token_id == all_token_ids[token_position]:
87
+ next_iter_prefix_offset = new_prefix_offset
88
+ next_iter_read_offset = new_read_offset
89
+ next_iter_tokens = new_tokens
90
+
91
+ # Advance to the next token position.
92
+ prefix_offset = next_iter_prefix_offset
93
+ read_offset = next_iter_read_offset
94
+ if prev_tokens is None:
95
+ prev_tokens = next_iter_tokens.copy()
96
+ else:
97
+ prev_tokens.extend(next_iter_tokens)
98
+
99
+ def decode_sequence_inplace(self, seq: Sequence,
100
+ prms: SamplingParams) -> int:
101
+ """Decodes the new token for a sequence. In-place operation.
102
+
103
+ Args:
104
+ seq: The sequence to decode.
105
+ prms: The sampling parameters used to generate the sequence.
106
+
107
+ Returns:
108
+ The number of characters added to the output text.
109
+ """
110
+ all_input_ids = seq.get_token_ids()
111
+ token_id_generated_this_iteration = all_input_ids[-1]
112
+ tokenizer = self.get_tokenizer_for_seq(seq)
113
+
114
+ # Convert prompt token IDs to tokens if necessary.
115
+ # Do it here so that we don't have to repeat this
116
+ # computation for each logprob.
117
+ if seq.tokens is None:
118
+ (seq.tokens, seq.prefix_offset,
119
+ seq.read_offset) = convert_prompt_ids_to_tokens(
120
+ tokenizer=tokenizer,
121
+ prompt_ids=all_input_ids[:-1],
122
+ skip_special_tokens=prms.skip_special_tokens,
123
+ )
124
+
125
+ (new_tokens, new_decoded_token_text, prefix_offset,
126
+ read_offset) = detokenize_incrementally(
127
+ tokenizer=tokenizer,
128
+ all_input_ids=all_input_ids,
129
+ prev_tokens=seq.tokens,
130
+ prefix_offset=seq.prefix_offset,
131
+ read_offset=seq.read_offset,
132
+ skip_special_tokens=prms.skip_special_tokens,
133
+ spaces_between_special_tokens=prms.spaces_between_special_tokens,
134
+ )
135
+
136
+ # Decode logprobs
137
+ logprobs = seq.output_logprobs[-1]
138
+ if logprobs:
139
+ previous_tokens = all_input_ids[:-1]
140
+ for token_id, sample_logprob in logprobs.items():
141
+ # If the token was generated this iteration,
142
+ # use the provided text.
143
+ if token_id == token_id_generated_this_iteration:
144
+ sample_logprob.decoded_token = new_decoded_token_text
145
+ continue
146
+
147
+ if (sample_logprob.decoded_token is None
148
+ and token_id != VLLM_INVALID_TOKEN_ID):
149
+ all_input_ids_with_logprob = previous_tokens + [token_id]
150
+ (_, new_text, _, _) = detokenize_incrementally(
151
+ tokenizer=tokenizer,
152
+ all_input_ids=all_input_ids_with_logprob,
153
+ prev_tokens=seq.tokens,
154
+ prefix_offset=seq.prefix_offset,
155
+ read_offset=seq.read_offset,
156
+ skip_special_tokens=prms.skip_special_tokens,
157
+ spaces_between_special_tokens=prms.
158
+ spaces_between_special_tokens,
159
+ )
160
+ sample_logprob.decoded_token = new_text
161
+
162
+ seq.tokens.extend(new_tokens)
163
+ seq.prefix_offset = prefix_offset
164
+ seq.read_offset = read_offset
165
+ seq.output_text += new_decoded_token_text
166
+
167
+ return len(new_decoded_token_text)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/detokenizer_utils.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import List, Optional, Tuple
4
+
5
+ from .tokenizer import AnyTokenizer
6
+
7
+
8
+ def _replace_none_with_empty(tokens: List[Optional[str]]):
9
+ for i, token in enumerate(tokens):
10
+ if token is None:
11
+ tokens[i] = ""
12
+
13
+
14
+ def _convert_tokens_to_string_with_added_encoders(
15
+ tokenizer: AnyTokenizer,
16
+ output_tokens: List[str],
17
+ skip_special_tokens: bool,
18
+ spaces_between_special_tokens: bool,
19
+ ) -> str:
20
+ # Adapted from
21
+ # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
22
+ # NOTE(woosuk): The following code is slow because it runs a for loop over
23
+ # the output_tokens. In Python, running a for loop over a list can be slow
24
+ # even when the loop body is very simple.
25
+ sub_texts: List[str] = []
26
+ current_sub_text: List[str] = []
27
+ all_special_tokens = set(tokenizer.all_special_tokens)
28
+ for token in output_tokens:
29
+ if skip_special_tokens and token in all_special_tokens:
30
+ continue
31
+ if token in tokenizer.get_added_vocab():
32
+ if current_sub_text:
33
+ sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
34
+ sub_texts.append(sub_text)
35
+ current_sub_text = []
36
+ sub_texts.append(token)
37
+ else:
38
+ current_sub_text.append(token)
39
+ if current_sub_text:
40
+ sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
41
+ sub_texts.append(sub_text)
42
+ if spaces_between_special_tokens:
43
+ return " ".join(sub_texts)
44
+ else:
45
+ return "".join(sub_texts)
46
+
47
+
48
+ # 5 is an arbitrary value that should work for all
49
+ # tokenizers (bigger = more conservative).
50
+ INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
51
+
52
+
53
+ def convert_prompt_ids_to_tokens(
54
+ tokenizer: AnyTokenizer,
55
+ prompt_ids: List[int],
56
+ skip_special_tokens: bool = False,
57
+ ) -> Tuple[List[str], int, int]:
58
+ """Converts the prompt ids to tokens and returns the tokens and offsets
59
+ for incremental detokenization.
60
+
61
+ Note that not all tokens are converted to strings. Only the tokens that
62
+ are necessary for incremental detokenization are converted to strings.
63
+ """
64
+ # We do not need to convert the whole prompt to tokens.
65
+ # Offset a little more in case we have special tokens.
66
+ new_tokens = tokenizer.convert_ids_to_tokens(
67
+ prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
68
+ skip_special_tokens=skip_special_tokens)
69
+ read_offset = len(new_tokens)
70
+ prefix_offset = max(
71
+ read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
72
+ # This is required to guard against out-of-vocab prompt token ids
73
+ _replace_none_with_empty(new_tokens) # type: ignore[arg-type]
74
+ return new_tokens, prefix_offset, read_offset
75
+
76
+
77
+ # Based on
78
+ # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
79
+ # under Apache 2.0 license
80
+ def detokenize_incrementally(
81
+ tokenizer: AnyTokenizer,
82
+ all_input_ids: List[int],
83
+ prev_tokens: Optional[List[str]],
84
+ prefix_offset: int,
85
+ read_offset: int,
86
+ skip_special_tokens: bool = False,
87
+ spaces_between_special_tokens: bool = True,
88
+ ) -> Tuple[List[str], str, int, int]:
89
+ """Detokenizes the input ids incrementally and returns the new tokens
90
+ and the new text.
91
+
92
+ If `prev_tokens` is None, this function will convert the input ids to
93
+ tokens and return the tokens and the new text. Otherwise, it will return the
94
+ new tokens and the new text.
95
+
96
+ This function will also return the new prefix offset and the new read
97
+ offset to be used in the next iteration.
98
+
99
+ The offsets are necessary to defeat cleanup algorithms in the decode which
100
+ decide to add a space or not depending on the surrounding ids.
101
+
102
+ Args:
103
+ tokenizer: The tokenizer to use.
104
+ all_input_ids: The input ids. The last id is the new token id.
105
+ prev_tokens: The previous tokens. If None, this function will convert
106
+ the input ids to tokens and return the tokens and the new text.
107
+ prefix_offset: The prefix offset.
108
+ read_offset: The read offset.
109
+ skip_special_tokens: Whether to skip special tokens.
110
+ spaces_between_special_tokens: Whether to add spaces between special
111
+ tokens.
112
+ """
113
+ new_token_id = all_input_ids[-1]
114
+ # This is the first iteration for this sequence
115
+ is_first_iter = prev_tokens is None
116
+ if is_first_iter:
117
+ (prev_tokens, prefix_offset,
118
+ read_offset) = convert_prompt_ids_to_tokens(
119
+ tokenizer,
120
+ all_input_ids[:-1],
121
+ skip_special_tokens=skip_special_tokens)
122
+ assert prev_tokens is not None
123
+
124
+ # If the new token id is out of bounds, return an empty string.
125
+ if 0 <= new_token_id < len(tokenizer):
126
+ # Put new_token_id in a list so skip_special_tokens is respected
127
+ new_tokens = tokenizer.convert_ids_to_tokens(
128
+ [new_token_id], skip_special_tokens=skip_special_tokens)
129
+ if isinstance(new_tokens, str):
130
+ new_tokens = [new_tokens]
131
+ else:
132
+ new_tokens = [""]
133
+ output_tokens = prev_tokens + new_tokens
134
+
135
+ # If this is the first iteration, return all tokens.
136
+ if is_first_iter:
137
+ new_tokens = output_tokens
138
+
139
+ # The prefix text is necessary only to defeat cleanup algorithms in
140
+ # the decode which decide to add a space or not depending on the
141
+ # surrounding ids.
142
+ if tokenizer.is_fast or not tokenizer.get_added_vocab():
143
+ prefix_text = tokenizer.convert_tokens_to_string(
144
+ output_tokens[prefix_offset:read_offset])
145
+ new_text = tokenizer.convert_tokens_to_string(
146
+ output_tokens[prefix_offset:])
147
+ else:
148
+ prefix_text = _convert_tokens_to_string_with_added_encoders(
149
+ tokenizer,
150
+ output_tokens[prefix_offset:read_offset],
151
+ skip_special_tokens=skip_special_tokens,
152
+ spaces_between_special_tokens=spaces_between_special_tokens,
153
+ )
154
+ new_text = _convert_tokens_to_string_with_added_encoders(
155
+ tokenizer,
156
+ output_tokens[prefix_offset:],
157
+ skip_special_tokens=skip_special_tokens,
158
+ spaces_between_special_tokens=spaces_between_special_tokens,
159
+ )
160
+
161
+ if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
162
+ # utf-8 char at the end means it's a potential unfinished byte sequence
163
+ # from byte fallback tokenization.
164
+ # If it's in the middle, it's probably a real invalid id generated
165
+ # by the model
166
+ return new_tokens, "", prefix_offset, read_offset
167
+
168
+ new_text = new_text[len(prefix_text):]
169
+ return new_tokens, new_text, read_offset, len(output_tokens)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processor.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from functools import lru_cache
4
+ from typing import Any, cast
5
+
6
+ from transformers.processing_utils import ProcessorMixin
7
+
8
+
9
+ def get_processor(
10
+ processor_name: str,
11
+ *args: Any,
12
+ trust_remote_code: bool = False,
13
+ processor_cls: type[ProcessorMixin] = ProcessorMixin,
14
+ **kwargs: Any,
15
+ ):
16
+ """Load a processor for the given model name via HuggingFace."""
17
+ # don't put this import at the top level
18
+ # it will call torch.cuda.device_count()
19
+ from transformers import AutoProcessor
20
+
21
+ processor_factory = (AutoProcessor
22
+ if processor_cls == ProcessorMixin else processor_cls)
23
+
24
+ try:
25
+ processor = processor_factory.from_pretrained(
26
+ processor_name,
27
+ *args,
28
+ trust_remote_code=trust_remote_code,
29
+ **kwargs,
30
+ )
31
+ except ValueError as e:
32
+ # If the error pertains to the processor class not existing or not
33
+ # currently being imported, suggest using the --trust-remote-code flag.
34
+ # Unlike AutoTokenizer, AutoProcessor does not separate such errors
35
+ if not trust_remote_code:
36
+ err_msg = (
37
+ "Failed to load the processor. If the processor is "
38
+ "a custom processor not yet available in the HuggingFace "
39
+ "transformers library, consider setting "
40
+ "`trust_remote_code=True` in LLM or using the "
41
+ "`--trust-remote-code` flag in the CLI.")
42
+ raise RuntimeError(err_msg) from e
43
+ else:
44
+ raise e
45
+
46
+ return cast(ProcessorMixin, processor)
47
+
48
+
49
+ cached_get_processor = lru_cache(get_processor)
50
+
51
+
52
+ def get_image_processor(
53
+ processor_name: str,
54
+ *args: Any,
55
+ trust_remote_code: bool = False,
56
+ **kwargs: Any,
57
+ ):
58
+ """Load an image processor for the given model name via HuggingFace."""
59
+ # don't put this import at the top level
60
+ # it will call torch.cuda.device_count()
61
+ from transformers import AutoImageProcessor
62
+ from transformers.image_processing_utils import BaseImageProcessor
63
+
64
+ try:
65
+ processor = AutoImageProcessor.from_pretrained(
66
+ processor_name,
67
+ *args,
68
+ trust_remote_code=trust_remote_code,
69
+ **kwargs)
70
+ except ValueError as e:
71
+ # If the error pertains to the processor class not existing or not
72
+ # currently being imported, suggest using the --trust-remote-code flag.
73
+ # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
74
+ if not trust_remote_code:
75
+ err_msg = (
76
+ "Failed to load the image processor. If the image processor is "
77
+ "a custom processor not yet available in the HuggingFace "
78
+ "transformers library, consider setting "
79
+ "`trust_remote_code=True` in LLM or using the "
80
+ "`--trust-remote-code` flag in the CLI.")
81
+ raise RuntimeError(err_msg) from e
82
+ else:
83
+ raise e
84
+
85
+ return cast(BaseImageProcessor, processor)
86
+
87
+
88
+ def get_video_processor(
89
+ processor_name: str,
90
+ *args: Any,
91
+ trust_remote_code: bool = False,
92
+ **kwargs: Any,
93
+ ):
94
+ """Load a video processor for the given model name via HuggingFace."""
95
+ # don't put this import at the top level
96
+ # it will call torch.cuda.device_count()
97
+ from transformers.image_processing_utils import BaseImageProcessor
98
+
99
+ processor = get_processor(
100
+ processor_name,
101
+ *args,
102
+ trust_remote_code=trust_remote_code,
103
+ **kwargs,
104
+ )
105
+
106
+ return cast(BaseImageProcessor, processor.video_processor)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from vllm.transformers_utils.processors.deepseek_vl2 import (
4
+ DeepseekVLV2Processor)
5
+
6
+ __all__ = ["DeepseekVLV2Processor"]
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (354 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/__pycache__/deepseek_vl2.cpython-311.pyc ADDED
Binary file (15.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/processors/deepseek_vl2.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # yapf: disable
4
+ # ruff: noqa: E501
5
+ # coding=utf-8
6
+ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
7
+ # Copyright (c) 2023-2024 DeepSeek.
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of
10
+ # this software and associated documentation files (the "Software"), to deal in
11
+ # the Software without restriction, including without limitation the rights to
12
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
13
+ # the Software, and to permit persons to whom the Software is furnished to do so,
14
+ # subject to the following conditions:
15
+ #
16
+ # The above copyright notice and this permission notice shall be included in all
17
+ # copies or substantial portions of the Software.
18
+ #
19
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
21
+ # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
22
+ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
23
+ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24
+ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
+
26
+ import math
27
+ from typing import List, Tuple
28
+
29
+ import torch
30
+ import torchvision.transforms as T
31
+ from PIL import Image, ImageOps
32
+ from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
33
+ from transformers.processing_utils import ProcessorMixin
34
+
35
+
36
class ImageTransform:
    """Convert a PIL image to a float tensor, optionally normalizing it."""

    def __init__(self,
                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
                 normalize: bool = True):
        self.mean = mean
        self.std = std
        self.normalize = normalize

        # ToTensor first; Normalize only when requested.
        steps = [T.ToTensor()]
        if normalize:
            steps.append(T.Normalize(mean, std))
        self.transform = T.Compose(steps)

    def __call__(self, pil_img: Image.Image):
        return self.transform(pil_img)
56
+
57
+
58
class DeepseekVLV2Processor(ProcessorMixin):
    """Processor for DeepSeek-VL2.

    Pairs a Llama tokenizer with an :class:`ImageTransform` and expands each
    ``<image>`` tag in the prompt into image tokens covering one padded
    global view plus a grid of local crops (anyres tiling).
    """

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]

    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        candidate_resolutions: Tuple[Tuple[int, int]],
        patch_size: int,
        downsample_ratio: int,
        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<|▁pad▁|>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):

        self.candidate_resolutions = candidate_resolutions
        # The first candidate resolution defines the square base tile size
        # used for the global view and each local crop.
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio

        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
        self.tokenizer = tokenizer
        # Left padding must be set: the padding side changes results in
        # batched inference.
        self.tokenizer.padding_side = 'left'

        # add the pad_token as special token to use 'tokenizer.pad_token'
        # and 'tokenizer.pad_token_id'
        if tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': pad_token})

        # add image token (only if the tokenizer does not already have one)
        image_token_id = self.tokenizer.vocab.get(image_token)
        if image_token_id is None:
            special_tokens = [image_token]
            special_tokens_dict = {"additional_special_tokens": special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
        self.image_token_id = self.tokenizer.vocab.get(image_token)

        # add five special tokens for grounding-related tasks
        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        # add special tokens for SFT data
        special_tokens = ["<|User|>", "<|Assistant|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id

        super().__init__(
            tokenizer,
            **kwargs,
        )

    def select_best_resolution(self, image_size):
        """Pick the candidate resolution that keeps the most of the image
        after downscaling, breaking ties by least wasted area.

        Used for cropping (anyres tiling).
        """
        original_width, original_height = image_size
        best_fit = None
        max_effective_resolution = 0
        min_wasted_resolution = float("inf")

        for width, height in self.candidate_resolutions:
            # Scale so the image fits entirely inside (width, height).
            scale = min(width / original_width, height / original_height)
            downscaled_width, downscaled_height = int(
                original_width * scale), int(original_height * scale)
            effective_resolution = min(downscaled_width * downscaled_height,
                                       original_width * original_height)
            wasted_resolution = (width * height) - effective_resolution

            if effective_resolution > max_effective_resolution or (
                    effective_resolution == max_effective_resolution
                    and wasted_resolution < min_wasted_resolution):
                max_effective_resolution = effective_resolution
                min_wasted_resolution = wasted_resolution
                best_fit = (width, height)

        return best_fit

    @property
    def bos_id(self):
        # Beginning-of-sequence token id of the wrapped tokenizer.
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        # End-of-sequence token id of the wrapped tokenizer.
        return self.tokenizer.eos_token_id

    @property
    def pad_id(self):
        # Padding token id of the wrapped tokenizer.
        return self.tokenizer.pad_token_id

    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Tokenize *text* without special tokens, then optionally prepend
        bos / append eos ids manually."""
        t = self.tokenizer.encode(text, add_special_tokens=False)

        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]

        return t

    def decode(self, t: List[int], **kwargs) -> str:
        """Decode token ids back to text via the wrapped tokenizer."""
        return self.tokenizer.decode(t, **kwargs)

    def process_one(
        self,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """

        Args:
            prompt (str): the formatted prompt;
            conversations (List[Dict]): conversations with a list of messages;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            system_prompt (str): the system prompt;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        """

        assert (prompt is not None and images is not None
                ), "prompt and images must be used at the same time."

        sft_format = prompt
        # Cropping (local tiling) is only enabled for prompts with at most
        # two images, bounding the number of local views.
        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
        # Build target ids with image-token positions masked out.
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
                masked_tokenized_str.append(token_index)
            else:
                masked_tokenized_str.append(self.ignore_id)

        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")

        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
        target_ids[(input_ids < 0) |
                   (input_ids == self.image_token_id)] = self.ignore_id
        input_ids[input_ids < 0] = self.pad_id

        if inference_mode:
            # Strip the trailing eos token (generation continues from here).
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            target_ids = target_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]

        if len(images_list) == 0:
            # Text-only prompt: emit a dummy all-zero image so downstream
            # code always receives pixel_values / images_spatial_crop.
            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

        input_ids = input_ids.unsqueeze(0)

        prepare = BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )
        return prepare

    def __call__(
        self,
        *,
        prompt: str,
        images: List[Image.Image],
        inference_mode: bool = True,
        **kwargs,
    ):
        """

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        """

        prepare = self.process_one(
            prompt=prompt,
            images=images,
            inference_mode=inference_mode,
        )

        return prepare

    def tokenize_with_images(
        self,
        conversation: str,
        images: List[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Each <image> tag is replaced by the image tokens for one padded
        global view plus (when ``cropping``) a grid of local crops.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)
        images_list, images_seq_mask, images_spatial_crop = [], [], []
        num_image_tokens = []
        tokenized_str = []
        for text_sep, image in zip(text_splits, images):
            """encode text_sep"""
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)

            """select best resolution for anyres"""
            if cropping:
                best_width, best_height = self.select_best_resolution(image.size)
            else:
                best_width, best_height = self.image_size, self.image_size

            """process the global view"""
            # Pad with the mean color so normalization maps padding to zero.
            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
            images_list.append(self.image_transform(global_view))

            """process the local views"""
            local_view = ImageOps.pad(image, (best_width, best_height),
                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
            for i in range(0, best_height, self.image_size):
                for j in range(0, best_width, self.image_size):
                    images_list.append(
                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))

            """record height / width crop num"""
            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
            images_spatial_crop.append([num_width_tiles, num_height_tiles])

            """add image tokens"""
            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))

        """process the last text split"""
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        """add the bos and eos tokens"""
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

        assert len(tokenized_str) == len(
            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
361
+
362
+
363
+ AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/s3_utils.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import fnmatch
4
+ import os
5
+ import shutil
6
+ import signal
7
+ import tempfile
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from vllm.utils import PlaceholderModule
12
+
13
+ try:
14
+ import boto3
15
+ except ImportError:
16
+ boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
17
+
18
+
19
+ def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
20
+ return [
21
+ path for path in paths if any(
22
+ fnmatch.fnmatch(path, pattern) for pattern in patterns)
23
+ ]
24
+
25
+
26
+ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
27
+ return [
28
+ path for path in paths
29
+ if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
30
+ ]
31
+
32
+
33
def glob(s3=None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
    List full file names from S3 path and filter by allow pattern.

    Args:
        s3: S3 client to use; a default client is created when omitted.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.

    Returns:
        list[str]: List of full S3 paths allowed by the pattern
    """
    client = boto3.client("s3") if s3 is None else s3
    bucket_name, _, keys = list_files(client,
                                      path=path,
                                      allow_pattern=allow_pattern)
    # Re-qualify the bare object keys into full s3:// URLs.
    return [f"s3://{bucket_name}/{key}" for key in keys]
53
+
54
+
55
def list_files(
        s3,
        path: str,
        allow_pattern: Optional[list[str]] = None,
        ignore_pattern: Optional[list[str]] = None
) -> tuple[str, str, list[str]]:
    """
    List files from S3 path and filter by pattern.

    Args:
        s3: S3 client to use.
        path: The S3 path to list from.
        allow_pattern: A list of patterns of which files to pull.
        ignore_pattern: A list of patterns of which files not to pull.

    Returns:
        tuple[str, str, list[str]]: A tuple where:
            - The first element is the bucket name
            - The second element is string represent the bucket
              and the prefix as a dir like string
            - The third element is a list of files allowed or
              disallowed by pattern
    """
    # "s3://bucket/some/prefix" -> ("bucket", "some/prefix")
    bucket_name, _, prefix = path.removeprefix('s3://').partition('/')

    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    keys = [entry['Key'] for entry in objects.get('Contents', [])]

    # Drop "directory" placeholder keys (those ending in '/').
    keys = _filter_ignore(keys, ["*/"])
    if allow_pattern is not None:
        keys = _filter_allow(keys, allow_pattern)
    if ignore_pattern is not None:
        keys = _filter_ignore(keys, ignore_pattern)

    return bucket_name, prefix, keys
93
+
94
+
95
class S3Model:
    """
    A class representing a S3 model mirrored into a temporary directory.

    Attributes:
        s3: S3 client.
        dir: The temporary created directory.

    Methods:
        pull_files(): Pull model from S3 to the temporary directory.
    """

    def __init__(self) -> None:
        self.s3 = boto3.client('s3')
        # Also clean the temp dir on SIGINT/SIGTERM, chaining to any
        # previously installed handler.
        for sig in (signal.SIGINT, signal.SIGTERM):
            existing_handler = signal.getsignal(sig)
            signal.signal(sig, self._close_by_signal(existing_handler))
        self.dir = tempfile.mkdtemp()

    def __del__(self):
        self._close()

    def _close(self) -> None:
        # Remove the mirrored files; safe to call multiple times.
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)

    def _close_by_signal(self, existing_handler=None):
        # Wrap cleanup around the previously installed signal handler.

        def new_handler(signum, frame):
            self._close()
            if existing_handler:
                existing_handler(signum, frame)

        return new_handler

    def pull_files(self,
                   s3_model_path: str = "",
                   allow_pattern: Optional[list[str]] = None,
                   ignore_pattern: Optional[list[str]] = None) -> None:
        """
        Pull files from S3 storage into the temporary directory.

        Args:
            s3_model_path: The S3 path of the model.
            allow_pattern: A list of patterns of which files to pull.
            ignore_pattern: A list of patterns of which files not to pull.

        """
        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
                                                  allow_pattern,
                                                  ignore_pattern)
        if len(files) == 0:
            return

        for file in files:
            # When `base_dir` has no trailing slash, the stripped key keeps
            # a leading '/', and os.path.join() would then discard
            # `self.dir` entirely (absolute second component). Strip it so
            # the file always lands inside the temp directory.
            rel_path = file.removeprefix(base_dir).lstrip('/')
            destination_file = os.path.join(self.dir, rel_path)
            local_dir = Path(destination_file).parent
            os.makedirs(local_dir, exist_ok=True)
            self.s3.download_file(bucket_name, file, destination_file)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import contextlib
4
+ import os
5
+ import warnings
6
+ from pathlib import Path
7
+ from types import MethodType
8
+ from typing import Optional, Union
9
+
10
+ import huggingface_hub
11
+ from transformers import (AutoTokenizer, PreTrainedTokenizer,
12
+ PreTrainedTokenizerFast)
13
+
14
+ from vllm.envs import VLLM_USE_MODELSCOPE
15
+ from vllm.logger import init_logger
16
+ from vllm.lora.request import LoRARequest
17
+ from vllm.transformers_utils.tokenizers import MistralTokenizer
18
+ from vllm.transformers_utils.utils import check_gguf_file
19
+ from vllm.utils import make_async
20
+
21
+ logger = init_logger(__name__)
22
+
23
+ AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
24
+ MistralTokenizer]
25
+
26
+
27
def decode_tokens(
    tokenizer: AnyTokenizer,
    token_ids: list[int],
    *,
    skip_special_tokens: bool = False,
) -> str:
    """
    Backend-agnostic equivalent of HF's
    :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`.
    """
    # All supported tokenizer backends accept this keyword directly.
    return tokenizer.decode(token_ids,
                            skip_special_tokens=skip_special_tokens)
38
+
39
+
40
def encode_tokens(
    tokenizer: AnyTokenizer,
    text: str,
    *,
    add_special_tokens: Optional[bool] = None,
) -> list[int]:
    """
    Backend-agnostic equivalent of HF's
    :code:`tokenizer.encode(text, add_special_tokens=...)`.
    """
    # Mistral tokenizers take bos/eos flags instead of add_special_tokens.
    if isinstance(tokenizer, MistralTokenizer):
        return tokenizer.tokenizer.encode(text,
                                          bos=add_special_tokens,
                                          eos=add_special_tokens)
    # Only forward the flag when the caller set it, so the HF default
    # behavior applies otherwise.
    if add_special_tokens is None:
        return tokenizer.encode(text)
    return tokenizer.encode(text, add_special_tokens=add_special_tokens)
57
+
58
+
59
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
    """Get tokenizer with cached properties.

    This will patch the tokenizer object in place.

    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown. This
    function caches these properties for faster access."""

    # Snapshot the expensive properties once, up front.
    cached_special_ids = set(tokenizer.all_special_ids)
    cached_special_tokens = set(tokenizer.all_special_tokens)
    cached_special_tokens_extended = tokenizer.all_special_tokens_extended
    cached_vocab = tokenizer.get_vocab()
    cached_len = len(tokenizer)

    cached_max_token_id = max(cached_vocab.values())
    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
    # are added and included in the implementation of the vocab_size
    # property, but not in get_vocab(); if there is an implementation
    # of vocab size, we should take the greater value.
    if hasattr(tokenizer, "vocab_size"):
        with contextlib.suppress(NotImplementedError):
            cached_max_token_id = max(cached_max_token_id,
                                      tokenizer.vocab_size)

    # Subclass the tokenizer's own class so isinstance checks keep working,
    # overriding the hot properties with the cached snapshots.
    class CachedTokenizer(tokenizer.__class__):  # type: ignore

        @property
        def all_special_ids(self):
            return cached_special_ids

        @property
        def all_special_tokens(self):
            return cached_special_tokens

        @property
        def all_special_tokens_extended(self):
            return cached_special_tokens_extended

        @property
        def max_token_id(self):
            return cached_max_token_id

        def get_vocab(self):
            return cached_vocab

        def __len__(self):
            return cached_len

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

    # Re-point the instance at the caching subclass (patched in place).
    tokenizer.__class__ = CachedTokenizer
    return tokenizer
112
+
113
+
114
def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
    """Patch _pad method to accept `padding_side` for older tokenizers."""
    orig_pad = tokenizer._pad

    def _pad(
        self: PreTrainedTokenizer,
        *args,
        padding_side: Optional[str] = None,
        **kwargs,
    ):
        # Older tokenizers cannot honor a per-call padding side: warn when
        # the caller asked for one that differs from the configured side.
        if padding_side not in (None, self.padding_side):
            warnings.warn(
                "`padding_side` argument is not supported by "
                f"{type(tokenizer).__name__} and will be ignored.",
                stacklevel=2)

        return orig_pad(*args, **kwargs)

    tokenizer._pad = MethodType(_pad, tokenizer)
132
+
133
+
134
def get_tokenizer(
    tokenizer_name: Union[str, Path],
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> AnyTokenizer:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.

    Args:
        tokenizer_name: Hub repo id or local path of the tokenizer.
        tokenizer_mode: "auto", "slow" (force the slow tokenizer), or
            "mistral" (use the mistral-common tokenizer).
        trust_remote_code: Allow executing custom tokenizer code from the hub.
        revision: Optional revision/branch/tag to download.
        download_dir: Cache directory for ModelScope downloads.

    Returns:
        The loaded tokenizer, wrapped with cached properties (HF path).
    """
    if VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            tokenizer_path = snapshot_download(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        # "slow" mode and use_fast=True are mutually exclusive.
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    if "truncation_side" not in kwargs:
        kwargs["truncation_side"] = "left"

    # Separate model folder from file path for GGUF models
    is_gguf = check_gguf_file(tokenizer_name)
    if is_gguf:
        kwargs["gguf_file"] = Path(tokenizer_name).name
        tokenizer_name = Path(tokenizer_name).parent

    # if tokenizer is from official mistral org
    is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
    if is_from_mistral_org and tokenizer_mode != "mistral":
        warnings.warn(
            'It is strongly recommended to run mistral models with '
            '`--tokenizer-mode "mistral"` to ensure correct '
            'encoding and decoding.',
            FutureWarning,
            stacklevel=2)

    if tokenizer_mode == "mistral":
        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                     revision=revision)
    else:
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                    "does not exist or is not currently imported." in str(e)
                    or "requires you to execute the tokenizer file" in str(e)):
                err_msg = ("Failed to load the tokenizer. If the tokenizer "
                           "is a custom tokenizer not yet available in the "
                           "HuggingFace transformers library, consider "
                           "setting `trust_remote_code=True` in LLM or using "
                           "the `--trust-remote-code` flag in the CLI.")
                raise RuntimeError(err_msg) from e
            else:
                raise e

        # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
        if type(tokenizer).__name__ in ("ChatGLMTokenizer",
                                        "ChatGLM4Tokenizer"):
            assert isinstance(tokenizer, PreTrainedTokenizer)
            patch_padding_side(tokenizer)

        if not isinstance(tokenizer, PreTrainedTokenizerFast):
            logger.warning(
                "Using a slow tokenizer. This might cause a significant "
                "slowdown. Consider using a fast tokenizer instead.")
        # Cache the expensive tokenizer properties (patched in place).
        tokenizer = get_cached_tokenizer(tokenizer)

    return tokenizer
227
+
228
+
229
def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[AnyTokenizer]:
    """Return the tokenizer shipped inside a LoRA adapter, or None."""
    if lora_request is None:
        return None
    try:
        return get_tokenizer(lora_request.lora_path, *args, **kwargs)
    except Exception as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            "No tokenizer found in %s, using base model tokenizer instead. "
            "(Exception: %s)", lora_request.lora_path, e)
        return None
243
+
244
+
245
+ get_lora_tokenizer_async = make_async(get_lora_tokenizer)
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__init__.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Optional, Type
4
+
5
+ from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
6
+ SchedulerConfig, TokenizerPoolConfig)
7
+ from vllm.executor.ray_utils import ray
8
+
9
+ from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
10
+ from .tokenizer_group import TokenizerGroup
11
+
12
+ if ray:
13
+ from .ray_tokenizer_group import RayTokenizerGroupPool
14
+ else:
15
+ RayTokenizerGroupPool = None # type: ignore
16
+
17
+
18
def init_tokenizer_from_configs(model_config: ModelConfig,
                                scheduler_config: SchedulerConfig,
                                parallel_config: ParallelConfig,
                                lora_config: LoRAConfig):
    """Build a tokenizer group from the engine's config objects."""
    # Flatten the relevant config fields into the group's init kwargs.
    return get_tokenizer_group(
        parallel_config.tokenizer_pool_config,
        tokenizer_id=model_config.tokenizer,
        enable_lora=bool(lora_config),
        max_num_seqs=scheduler_config.max_num_seqs,
        max_loras=lora_config.max_loras if lora_config else 0,
        max_input_length=None,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
        revision=model_config.tokenizer_revision,
        truncation_side=model_config.truncation_side)
34
+
35
+
36
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    """Resolve and instantiate the tokenizer-group implementation
    selected by the pool config."""
    tokenizer_cls: Type[BaseTokenizerGroup]
    if tokenizer_pool_config is None:
        # No pool configured: plain in-process tokenizer group.
        tokenizer_cls = TokenizerGroup
    else:
        pool_type = tokenizer_pool_config.pool_type
        if isinstance(pool_type, type) and issubclass(
                pool_type, BaseTokenizerGroup):
            # Caller supplied a custom tokenizer-group class directly.
            tokenizer_cls = pool_type
        elif pool_type == "ray":
            if RayTokenizerGroupPool is None:
                raise ImportError(
                    "RayTokenizerGroupPool is not available. Please install "
                    "the ray package to use the Ray tokenizer group pool.")
            tokenizer_cls = RayTokenizerGroupPool
        else:
            raise ValueError(
                f"Unknown pool type: {tokenizer_pool_config.pool_type}")
    return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs)
54
+
55
+
56
+ __all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"]
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.67 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-311.pyc ADDED
Binary file (3.43 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-311.pyc ADDED
Binary file (5.92 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import List, Optional
5
+
6
+ from vllm.config import TokenizerPoolConfig
7
+ from vllm.lora.request import LoRARequest
8
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
9
+
10
+
11
+ class BaseTokenizerGroup(ABC):
12
+ """A group of tokenizers that can be used for LoRA adapters."""
13
+
14
+ @classmethod
15
+ @abstractmethod
16
+ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
17
+ **init_kwargs) -> "BaseTokenizerGroup":
18
+ pass
19
+
20
+ @abstractmethod
21
+ def ping(self) -> bool:
22
+ """Check if the tokenizer group is alive."""
23
+ pass
24
+
25
+ @abstractmethod
26
+ def get_max_input_len(
27
+ self,
28
+ lora_request: Optional[LoRARequest] = None,
29
+ ) -> Optional[int]:
30
+ """Get the maximum input length for the LoRA request."""
31
+ pass
32
+
33
+ @abstractmethod
34
+ def encode(self,
35
+ prompt: str,
36
+ request_id: Optional[str] = None,
37
+ lora_request: Optional[LoRARequest] = None,
38
+ add_special_tokens: Optional[bool] = None) -> List[int]:
39
+ """Encode a prompt using the tokenizer group."""
40
+ pass
41
+
42
+ @abstractmethod
43
+ async def encode_async(
44
+ self,
45
+ prompt: str,
46
+ request_id: Optional[str] = None,
47
+ lora_request: Optional[LoRARequest] = None,
48
+ add_special_tokens: Optional[bool] = None) -> List[int]:
49
+ """Encode a prompt using the tokenizer group."""
50
+ pass
51
+
52
+ @abstractmethod
53
+ def get_lora_tokenizer(
54
+ self,
55
+ lora_request: Optional[LoRARequest] = None,
56
+ ) -> AnyTokenizer:
57
+ """Get a tokenizer for a LoRA request."""
58
+ pass
59
+
60
+ @abstractmethod
61
+ async def get_lora_tokenizer_async(
62
+ self,
63
+ lora_request: Optional[LoRARequest] = None,
64
+ ) -> AnyTokenizer:
65
+ """Get a tokenizer for a LoRA request."""
66
+ pass
67
+
68
+ def check_health(self):
69
+ """Raise exception if the tokenizer group is unhealthy."""
70
+ return
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import asyncio
4
+ import os
5
+ from typing import List, Optional
6
+
7
+ try:
8
+ from ray.exceptions import ActorDiedError # type: ignore
9
+ except ImportError:
10
+ # For older versions of Ray
11
+ from ray.exceptions import RayActorError as ActorDiedError # type: ignore
12
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
13
+
14
+ from vllm.config import TokenizerPoolConfig
15
+ from vllm.executor.ray_utils import ray
16
+ from vllm.logger import init_logger
17
+ from vllm.lora.request import LoRARequest
18
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
19
+
20
+ from .base_tokenizer_group import BaseTokenizerGroup
21
+ from .tokenizer_group import TokenizerGroup
22
+
23
+ logger = init_logger(__name__)
24
+
25
+
26
+ class RayTokenizerGroupPool(BaseTokenizerGroup):
27
+ """A Ray-based pool of TokenizerGroups for async tokenization."""
28
+
29
+ # Class to use for workers making up the pool.
30
+ _worker_cls = TokenizerGroup
31
+
32
+ @classmethod
33
+ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
34
+ **init_kwargs) -> "RayTokenizerGroupPool":
35
+ if not tokenizer_pool_config:
36
+ raise ValueError("tokenizer_pool_config must not be None.")
37
+ ray_actor_options = (tokenizer_pool_config.extra_config or {
38
+ "num_cpus": 0
39
+ })
40
+ ray_actor_options.setdefault(
41
+ "scheduling_strategy",
42
+ NodeAffinitySchedulingStrategy(
43
+ node_id=ray.get_runtime_context().get_node_id(), soft=True))
44
+
45
+ # Carry over the env vars to the actors.
46
+ # This is necessary for API keys and such.
47
+ ray_actor_options.setdefault("runtime_env", {})
48
+ _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
49
+
50
+ init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
51
+ init_kwargs["ray_actor_options"] = ray_actor_options
52
+
53
+ return cls(**init_kwargs)
54
+
55
+ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
56
+ max_input_length: Optional[int], num_actors: int,
57
+ ray_actor_options: dict, **tokenizer_config):
58
+ # Store a local copy of the TokenizerGroup for quick access
59
+ # to underlying HF tokenizers.
60
+ self._tokenizer_config = {
61
+ "tokenizer_id": tokenizer_id,
62
+ "enable_lora": enable_lora,
63
+ "max_num_seqs": max_num_seqs,
64
+ "max_input_length": max_input_length,
65
+ **tokenizer_config
66
+ }
67
+ self._local_tokenizer_group = self._worker_cls(
68
+ **self._tokenizer_config, )
69
+
70
+ self._ray_tokenizer_group_cls = ray.remote(
71
+ self._worker_cls).options(**ray_actor_options) # type: ignore
72
+ self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
73
+ self._idle_actors: Optional[asyncio.Queue] = None
74
+
75
+ # If set, actor is unhealthy. Will reraise on the next
76
+ # check_health call.
77
+ self._exception: Optional[ActorDiedError] = None
78
+
79
+ def _init_actor(self) -> ray.ObjectRef:
80
+ return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)
81
+
82
+ @property
83
+ def pool_size(self) -> int:
84
+ return len(self.tokenizer_actors)
85
+
86
+ def ping(self):
87
+ return ray.get([
88
+ actor.ping.remote() # type: ignore
89
+ for actor in self.tokenizer_actors
90
+ ])
91
+
92
+ def _ensure_queue_initialized(self):
93
+ if self._idle_actors is None:
94
+ self._idle_actors = asyncio.Queue()
95
+ for actor in self.tokenizer_actors:
96
+ self._idle_actors.put_nowait(actor)
97
+
98
+ def _finalize_encode(self, actor: ray.ObjectRef,
99
+ original_actor: ray.ObjectRef, actor_is_alive: bool):
100
+ assert self._idle_actors is not None
101
+ # Cleanup the dead actor.
102
+ if not actor_is_alive or original_actor is not actor:
103
+ self.tokenizer_actors.remove(original_actor)
104
+ if actor_is_alive:
105
+ # Put the actor back in the queue.
106
+ # This is done in a finally block to ensure that the actor is
107
+ # always put back in the queue, even if an exception/cancellation
108
+ # is raised.
109
+ self._idle_actors.put_nowait(actor)
110
+ # Add back the new actor.
111
+ if original_actor is not actor:
112
+ self.tokenizer_actors.append(actor)
113
+
114
+ def encode(self,
115
+ prompt: str,
116
+ request_id: Optional[str] = None,
117
+ lora_request: Optional[LoRARequest] = None,
118
+ add_special_tokens: Optional[bool] = None) -> List[int]:
119
+ """Encode a prompt using the tokenizer group.
120
+
121
+ We pick an idle actor and use it to encode the prompt.
122
+ The actor is then put back in the queue for future use.
123
+ This is blocking.
124
+ """
125
+ self.check_health()
126
+ self._ensure_queue_initialized()
127
+ assert self._idle_actors is not None
128
+
129
+ if self._idle_actors.empty():
130
+ raise RuntimeError("No idle actors available.")
131
+ actor = self._idle_actors.get_nowait()
132
+ actor_is_alive = True
133
+ original_actor = actor
134
+ try:
135
+ ret = ray.get(
136
+ actor.encode.remote(request_id=request_id,
137
+ prompt=prompt,
138
+ lora_request=lora_request,
139
+ add_special_tokens=add_special_tokens))
140
+ except ActorDiedError as e:
141
+ # If the actor is dead, we first try to reinitialize it.
142
+ logger.warning("%s died with ActorDiedError, reinitializing.",
143
+ actor,
144
+ exc_info=e)
145
+ actor = self._init_actor()
146
+ try:
147
+ ret = ray.get(
148
+ actor.encode.remote(request_id=request_id,
149
+ prompt=prompt,
150
+ lora_request=lora_request,
151
+ add_special_tokens=add_special_tokens))
152
+ except ActorDiedError as e:
153
+ logger.error(
154
+ "%s died for second time in a row, marking "
155
+ "RayTokenizerGroupPool as unhealthy.", actor)
156
+ actor_is_alive = False
157
+ if not self._exception:
158
+ self._exception = e
159
+ self.check_health()
160
+ finally:
161
+ self._finalize_encode(actor, original_actor, actor_is_alive)
162
+ return ret
163
+
164
+ async def encode_async(
165
+ self,
166
+ prompt: str,
167
+ request_id: Optional[str] = None,
168
+ lora_request: Optional[LoRARequest] = None,
169
+ add_special_tokens: Optional[bool] = None) -> List[int]:
170
+ """Encode a prompt using the tokenizer group.
171
+
172
+ We pick an idle actor and use it to encode the prompt.
173
+ If there are no idle actors, we wait until one becomes
174
+ available.
175
+ The actor is then put back in the queue for future use.
176
+ This is non-blocking.
177
+ """
178
+ self.check_health()
179
+ self._ensure_queue_initialized()
180
+ assert self._idle_actors is not None
181
+
182
+ actor = await self._idle_actors.get()
183
+ actor_is_alive = True
184
+ original_actor = actor
185
+ try:
186
+ ret = await actor.encode.remote(
187
+ request_id=request_id,
188
+ prompt=prompt,
189
+ lora_request=lora_request,
190
+ add_special_tokens=add_special_tokens)
191
+ except ActorDiedError as e:
192
+ # If the actor is dead, we first try to reinitialize it.
193
+ logger.warning("%s died with ActorDiedError, reinitializing.",
194
+ actor,
195
+ exc_info=e)
196
+ actor = self._init_actor()
197
+ try:
198
+ ret = await actor.encode.remote(
199
+ request_id=request_id,
200
+ prompt=prompt,
201
+ lora_request=lora_request,
202
+ add_special_tokens=add_special_tokens)
203
+ except ActorDiedError as e:
204
+ logger.error(
205
+ "%s died for second time in a row, marking "
206
+ "RayTokenizerGroupPool as unhealthy.", actor)
207
+ actor_is_alive = False
208
+ if not self._exception:
209
+ self._exception = e
210
+ self.check_health()
211
+ finally:
212
+ self._finalize_encode(actor, original_actor, actor_is_alive)
213
+ return ret
214
+
215
+ def get_max_input_len(self,
216
+ lora_request: Optional[LoRARequest] = None
217
+ ) -> Optional[int]:
218
+ """Get the maximum input length for the LoRA request."""
219
+ return self._local_tokenizer_group.get_max_input_len(lora_request)
220
+
221
+ def get_lora_tokenizer(
222
+ self,
223
+ lora_request: Optional[LoRARequest] = None,
224
+ ) -> AnyTokenizer:
225
+ return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
226
+
227
+ async def get_lora_tokenizer_async(
228
+ self,
229
+ lora_request: Optional[LoRARequest] = None,
230
+ ) -> AnyTokenizer:
231
+ return await self._local_tokenizer_group.get_lora_tokenizer_async(
232
+ lora_request)
233
+
234
+ def check_health(self):
235
+ if self._exception:
236
+ raise RuntimeError(
237
+ "TokenizerGroupPool is unhealthy.") from self._exception
238
+
239
+
240
+ def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
241
+ """Copy over all current process environment variables to the runtime_env.
242
+
243
+ The variables in runtime_env will take precedence over the current process
244
+ environment variables.
245
+
246
+ runtime_env will be modified in place."""
247
+ env_vars = os.environ.copy()
248
+ runtime_env.setdefault("env_vars", {})
249
+ env_vars.update(runtime_env["env_vars"])
250
+ runtime_env["env_vars"] = env_vars
.venv/lib/python3.11/site-packages/vllm/transformers_utils/tokenizer_group/tokenizer_group.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import List, Optional
4
+
5
+ from vllm.config import TokenizerPoolConfig
6
+ from vllm.lora.request import LoRARequest
7
+ from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
8
+ get_lora_tokenizer,
9
+ get_lora_tokenizer_async,
10
+ get_tokenizer)
11
+ from vllm.utils import LRUCache
12
+
13
+ from .base_tokenizer_group import BaseTokenizerGroup
14
+
15
+
16
+ class TokenizerGroup(BaseTokenizerGroup):
17
+ """A group of tokenizers that can be used for LoRA adapters."""
18
+
19
+ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
20
+ max_input_length: Optional[int], **tokenizer_config):
21
+ self.tokenizer_id = tokenizer_id
22
+ self.tokenizer_config = tokenizer_config
23
+ self.enable_lora = enable_lora
24
+ self.max_input_length = max_input_length
25
+ self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
26
+ max_loras = tokenizer_config.get("max_loras", 0)
27
+ self.lora_tokenizers = LRUCache[int, AnyTokenizer](
28
+ capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
29
+
30
+ @classmethod
31
+ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
32
+ **init_kwargs) -> "TokenizerGroup":
33
+ return cls(**init_kwargs)
34
+
35
+ def ping(self) -> bool:
36
+ """Check if the tokenizer group is alive."""
37
+ return True
38
+
39
+ def get_max_input_len(self,
40
+ lora_request: Optional[LoRARequest] = None
41
+ ) -> Optional[int]:
42
+ """Get the maximum input length for the LoRA request."""
43
+ return self.max_input_length
44
+
45
+ def _raise_if_input_too_long(self,
46
+ encoded_tokens: List[int],
47
+ lora_request: Optional[LoRARequest] = None):
48
+ input_length = len(encoded_tokens)
49
+ if lora_request:
50
+ max_input_length = (lora_request.long_lora_max_len
51
+ or self.max_input_length)
52
+ else:
53
+ max_input_length = self.max_input_length
54
+ if max_input_length is not None and input_length > max_input_length:
55
+ raise ValueError("Input too long.", input_length, max_input_length)
56
+
57
+ def encode(self,
58
+ prompt: str,
59
+ request_id: Optional[str] = None,
60
+ lora_request: Optional[LoRARequest] = None,
61
+ add_special_tokens: Optional[bool] = None) -> List[int]:
62
+ tokenizer = self.get_lora_tokenizer(lora_request)
63
+ ret = encode_tokens(tokenizer,
64
+ prompt,
65
+ add_special_tokens=add_special_tokens)
66
+ self._raise_if_input_too_long(ret, lora_request)
67
+ return ret
68
+
69
+ async def encode_async(
70
+ self,
71
+ prompt: str,
72
+ request_id: Optional[str] = None,
73
+ lora_request: Optional[LoRARequest] = None,
74
+ add_special_tokens: Optional[bool] = None) -> List[int]:
75
+ tokenizer = await self.get_lora_tokenizer_async(lora_request)
76
+ ret = encode_tokens(tokenizer,
77
+ prompt,
78
+ add_special_tokens=add_special_tokens)
79
+ self._raise_if_input_too_long(ret, lora_request)
80
+ return ret
81
+
82
+ def get_lora_tokenizer(
83
+ self,
84
+ lora_request: Optional[LoRARequest] = None,
85
+ ) -> AnyTokenizer:
86
+ if not lora_request or not self.enable_lora:
87
+ return self.tokenizer
88
+ if lora_request.lora_int_id not in self.lora_tokenizers:
89
+ tokenizer = (get_lora_tokenizer(
90
+ lora_request, **self.tokenizer_config) or self.tokenizer)
91
+ self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
92
+ return tokenizer
93
+ else:
94
+ return self.lora_tokenizers[lora_request.lora_int_id]
95
+
96
+ async def get_lora_tokenizer_async(
97
+ self,
98
+ lora_request: Optional[LoRARequest] = None,
99
+ ) -> AnyTokenizer:
100
+ if not lora_request or not self.enable_lora:
101
+ return self.tokenizer
102
+ if lora_request.lora_int_id not in self.lora_tokenizers:
103
+ tokenizer = (await get_lora_tokenizer_async(
104
+ lora_request, **self.tokenizer_config) or self.tokenizer)
105
+ self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
106
+ return tokenizer
107
+ else:
108
+ return self.lora_tokenizers[lora_request.lora_int_id]
.venv/lib/python3.11/site-packages/vllm/transformers_utils/utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from os import PathLike
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+
8
+ def is_s3(model_or_path: str) -> bool:
9
+ return model_or_path.lower().startswith('s3://')
10
+
11
+
12
+ def check_gguf_file(model: Union[str, PathLike]) -> bool:
13
+ """Check if the file is a GGUF model."""
14
+ model = Path(model)
15
+ if not model.is_file():
16
+ return False
17
+ elif model.suffix == ".gguf":
18
+ return True
19
+
20
+ with open(model, "rb") as f:
21
+ header = f.read(4)
22
+ return header == b"GGUF"
.venv/lib/python3.11/site-packages/vllm/v1/engine/__init__.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING, List, Optional, Union
6
+
7
+ import msgspec
8
+
9
+ from vllm.v1.metrics.stats import SchedulerStats
10
+
11
+ if TYPE_CHECKING:
12
+ from vllm.lora.request import LoRARequest
13
+ from vllm.multimodal import MultiModalKwargs
14
+ from vllm.multimodal.inputs import PlaceholderRange
15
+ from vllm.sampling_params import SamplingParams
16
+
17
+ # These are possible values of RequestOutput.finish_reason,
18
+ # so form part of the external API.
19
+ FINISH_REASON_STRINGS = ("stop", "length", "abort")
20
+
21
+
22
+ class FinishReason(enum.IntEnum):
23
+ """
24
+ Reason a request finished - stop, length, or abort.
25
+
26
+ Int rather than Str for more compact serialization.
27
+
28
+ stop - a stop string was emitted
29
+ length - max_tokens was consumed, or max_model_len was reached
30
+ abort - aborted for another reason
31
+
32
+ """
33
+ STOP = 0
34
+ LENGTH = 1
35
+ ABORT = 2
36
+
37
+ def __str__(self):
38
+ return FINISH_REASON_STRINGS[self.value]
39
+
40
+
41
+ @dataclass
42
+ class EngineCoreRequest:
43
+
44
+ # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
45
+ # but this object is currently not playing well with msgspec
46
+ # due to circular imports and typing we have in data.py
47
+
48
+ request_id: str
49
+ # NOTE(ywang96): original text prompt is needed when a request is added to
50
+ # Detokenizer, but set to None when it is added to EngineCoreClient.
51
+ prompt: Optional[str]
52
+ prompt_token_ids: List[int]
53
+ mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
54
+ mm_hashes: Optional[List[str]]
55
+ mm_placeholders: Optional[List["PlaceholderRange"]]
56
+ sampling_params: "SamplingParams"
57
+ eos_token_id: Optional[int]
58
+ arrival_time: float
59
+ lora_request: Optional["LoRARequest"]
60
+
61
+
62
+ class EngineCoreOutput(
63
+ msgspec.Struct,
64
+ array_like=True, # type: ignore[call-arg]
65
+ omit_defaults=True, # type: ignore[call-arg]
66
+ gc=False): # type: ignore[call-arg]
67
+
68
+ request_id: str
69
+ new_token_ids: List[int]
70
+ finished: bool
71
+ finish_reason: Optional[FinishReason] = None
72
+ stop_reason: Union[int, str, None] = None
73
+
74
+
75
+ class EngineCoreOutputs(
76
+ msgspec.Struct,
77
+ array_like=True, # type: ignore[call-arg]
78
+ omit_defaults=True, # type: ignore[call-arg]
79
+ gc=False): # type: ignore[call-arg]
80
+
81
+ #NOTE(Nick): We could consider ways to make this more compact,
82
+ # e.g. columnwise layout
83
+
84
+ # [num_reqs]
85
+ outputs: List[EngineCoreOutput]
86
+ scheduler_stats: SchedulerStats
87
+
88
+
89
+ @dataclass
90
+ class EngineCoreProfile:
91
+ is_start: bool
92
+
93
+
94
+ @dataclass
95
+ class EngineCoreResetPrefixCache:
96
+ pass
97
+
98
+
99
+ class EngineCoreRequestType(enum.Enum):
100
+ """
101
+ Request types defined as hex byte strings, so it can be sent over sockets
102
+ without separate encoding step.
103
+ """
104
+ ADD = b'\x00'
105
+ ABORT = b'\x01'
106
+ PROFILE = b'\x02'
107
+ RESET_PREFIX_CACHE = b'\x03'
108
+
109
+
110
+ EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile,
111
+ EngineCoreResetPrefixCache, List[str]]
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (4.57 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/async_llm.cpython-311.pyc ADDED
Binary file (16.5 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core.cpython-311.pyc ADDED
Binary file (14.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/core_client.cpython-311.pyc ADDED
Binary file (17.3 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/v1/engine/__pycache__/detokenizer.cpython-311.pyc ADDED
Binary file (6.93 kB). View file