MSALab committed on
Commit
cadf670
·
verified ·
1 Parent(s): dd8925b

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ __pycache__/modeling_llada.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ library_name: transformers
6
+ pipeline_tag: image-text-to-text
7
+ base_model:
8
+ - MSALab/PerceptionDLM-Base
9
+ tags:
10
+ - multimodal
11
+ - diffusion-language-model
12
+ - dllm
13
+ - region-captioning
14
+ - dense-captioning
15
+ - parallel-decoding
16
+ ---
17
+
18
+ # PerceptionDLM
19
+
20
+ **PerceptionDLM** is a multimodal **diffusion** language model optimized for **efficient parallel region perception**. Built upon [**PerceptionDLM-Base**](https://huggingface.co/MSALab/PerceptionDLM-Base), it fully leverages the parallel decoding nature of diffusion language models (DLMs): given an image and multiple region masks, it generates descriptions for **all regions simultaneously** within a single denoising process — avoiding the linear latency growth of autoregressive (AR) region captioners.
21
+
22
+ To the best of our knowledge, this is the first model to achieve **parallel region captioning and perception** by leveraging the advantages of diffusion language models.
23
+
24
+ <p align="center">
25
+ 📄 <a href="https://arxiv.org/abs/2606.19534">Paper</a> &nbsp;|&nbsp;
26
+ 💻 <a href="https://github.com/MSALab-PKU/PerceptionDLM">Code</a> &nbsp;|&nbsp;
27
+ 📊 <a href="https://huggingface.co/datasets/MSALab/ParaDLC-Bench">ParaDLC-Bench</a>
28
+ </p>
29
+
30
+ ## Highlights
31
+
32
+ - 🧩 **Parallel region captioning.** Region prompting + structured attention masking describe many masked regions in a single denoising pass.
33
+ - ⚡ **Up to 3.44× throughput speedup** in dense multi-region scenarios, with stable per-image latency (~2.9s).
34
+ - 🎯 **Competitive quality** with strong AR region captioners while being substantially faster.
35
+
36
+ ## Model Details
37
+
38
+ | | |
39
+ | :--- | :--- |
40
+ | Base model | [MSALab/PerceptionDLM-Base](https://huggingface.co/MSALab/PerceptionDLM-Base) |
41
+ | Key modules | Region prompting, RoI-aligned feature replay, structured attention masking |
42
+ | Region prompts | up to 6 per image |
43
+ | Default inference | 32 diffusion steps, generation length 32 per mask |
44
+ | Training | full ParaCaption corpus, ~2 days on 32× H100 |
45
+ | Precision | bfloat16 |
46
+
47
+ ## Results (ParaDLC-Bench)
48
+
49
+ | Method | Type | Avg (%) | TPF ↑ | Time (s) ↓ |
50
+ | :--- | :--- | :---: | :---: | :---: |
51
+ | GAR-8B | AR (sequential) | 69.5 | 1.0 | 479 |
52
+ | LLaDA-V-8B | Diffusion | 35.2 | 1.0 | 3241 |
53
+ | **PerceptionDLM** | **Diffusion (parallel)** | **62.4** | **2.9** | **276** |
54
+
55
+ `TPF` = Tokens Per Forward (higher = more parallel). PerceptionDLM nearly doubles the accuracy of prior diffusion VLMs while drastically reducing inference time.
56
+
57
+ ## Usage
58
+
59
+ Full inference scripts are provided in the [GitHub repository](https://github.com/MSALab-PKU/PerceptionDLM).
60
+
61
+ ```bash
62
+ python demo/infer_pdmllm.py \
63
+ --model-path MSALab/PerceptionDLM \
64
+ --image assets/demo.jpg \
65
+ --masks assets/demo_mask_0.jpg \
66
+ assets/demo_mask_1.jpg \
67
+ assets/demo_mask_2.jpg \
68
+ --gen-length 32 --steps 32 --temperature 0.0 --top-p 1.0
69
+ ```
70
+
71
+ The model takes an RGB image plus one or more binary masks, and returns one caption per region — all generated in parallel.
72
+
73
+ ## Citation
74
+
75
+ ```bibtex
76
+ @article{sun2026perceptiondlm,
77
+ title = {PerceptionDLM: Parallel Region Perception with Multimodal Diffusion Language Models},
78
+ author = {Sun, Yueyi and Wang, Yuhao and Li, Jason and Tian, Ye and Zhang, Tao and Mai, Jacky and Wang, Yihan and Wang, Haochen and Bai, Jinbin and Yang, Ling and Tong, Yunhai},
79
+ journal = {arXiv preprint arXiv:2606.19534},
80
+ year = {2026}
81
+ }
82
+ ```
83
+
84
+ ## License
85
+
86
+ Released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
cache.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class dLLMCacheConfig:
    """Plain settings record for the dLLM feature cache.

    The four fields share their names and default values with the keyword
    arguments of ``dLLMCache.new_instance`` in this file.
    NOTE(review): presumably the values are forwarded to that method by the
    caller; nothing in this file does so — confirm against the call site.
    """

    # Same name as the interval read by ``dLLMCache.refresh_prompt``.
    prompt_interval_steps: int = 1
    # Same name as the interval read by ``dLLMCache.refresh_gen``.
    gen_interval_steps: int = 1
    # Stored on the cache by ``new_instance``; not read by any code in this file.
    transfer_ratio: float = 0.0
    # Same name as the interval read by ``dLLMCache.refresh_cfg``.
    cfg_interval_steps: int = 1
10
+
11
+
12
+ import torch
13
+ from collections import defaultdict
14
+
15
+
16
class Singleton(type):
    """Metaclass that creates at most one instance per class and then reuses it."""

    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Only the first call constructs; later calls return the stored object
        # and their constructor arguments are ignored.
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]


class dLLMCache(metaclass=Singleton):
    """Process-wide (singleton) store for per-layer features plus a per-layer step counter.

    Features are stored under four keys:
    ``self.cache_type`` -> ``cache_type`` argument -> ``layer_id`` -> ``feature_name``.
    The step counter is keyed by ``self.cache_type`` -> ``layer_id``.
    """

    gen_interval_steps: int
    prompt_interval_steps: int
    cfg_interval_steps: int
    prompt_length: int
    transfer_ratio: float
    __cache: defaultdict
    __step_counter: defaultdict

    def __init__(self) -> None:
        # Runs once, when the singleton is first created. Without these defaults
        # every method that reads ``self.cache_type`` raised AttributeError until
        # ``reset_cache`` had been called at least once.
        self.cache_type = "no_cfg"
        self.prompt_length = 0
        self.init()

    @classmethod
    def new_instance(
        cls,
        prompt_interval_steps: int = 1,
        gen_interval_steps: int = 1,
        cfg_interval_steps: int = 1,
        transfer_ratio: float = 0.0,
    ) -> "dLLMCache":
        """Return the shared instance, reconfigured with the given settings and emptied.

        Because of the ``Singleton`` metaclass this never creates a second
        object: the existing one is updated in place and its storage is reset.
        """
        ins = cls()
        ins.prompt_interval_steps = prompt_interval_steps
        ins.gen_interval_steps = gen_interval_steps
        ins.cfg_interval_steps = cfg_interval_steps
        ins.transfer_ratio = transfer_ratio
        ins.init()
        return ins

    def init(self) -> None:
        """Replace the feature store and the step counters with empty ones."""
        self.__cache = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
        )
        self.__step_counter = defaultdict(lambda: defaultdict(lambda: 0))

    def reset_cache(self, prompt_length: int = 0) -> None:
        """Empty the cache, record ``prompt_length`` and switch back to the ``"no_cfg"`` namespace."""
        self.init()
        torch.cuda.empty_cache()
        self.prompt_length = prompt_length
        self.cache_type = "no_cfg"

    def set_cache(
        self, layer_id: int, feature_name: str, features: torch.Tensor, cache_type: str
    ) -> None:
        """Store ``features`` for the given layer/feature under the current ``self.cache_type``."""
        self.__cache[self.cache_type][cache_type][layer_id][feature_name] = {
            0: features
        }

    def get_cache(
        self, layer_id: int, feature_name: str, cache_type: str
    ) -> torch.Tensor:
        """Return the features previously stored with ``set_cache``.

        Raises:
            KeyError: if nothing was stored for this combination of keys.
        """
        output = self.__cache[self.cache_type][cache_type][layer_id][feature_name][0]
        return output

    def update_step(self, layer_id: int) -> None:
        """Increment the step counter of ``layer_id`` in the current ``self.cache_type``."""
        self.__step_counter[self.cache_type][layer_id] += 1

    def refresh_gen(self, layer_id: int = 0) -> bool:
        """Return True when ``current_step - 1`` is a multiple of ``gen_interval_steps``.

        ``layer_id`` is accepted but not used.
        """
        return (self.current_step - 1) % self.gen_interval_steps == 0

    def refresh_prompt(self, layer_id: int = 0) -> bool:
        """Return True when ``current_step - 1`` is a multiple of ``prompt_interval_steps``.

        ``layer_id`` is accepted but not used.
        """
        return (self.current_step - 1) % self.prompt_interval_steps == 0

    def refresh_cfg(self, layer_id: int = 0) -> bool:
        """Return True on every ``cfg_interval_steps``-th step and unconditionally while ``current_step <= 5``.

        ``layer_id`` is accepted but not used.
        """
        return (
            self.current_step - 1
        ) % self.cfg_interval_steps == 0 or self.current_step <= 5

    @property
    def current_step(self) -> int:
        """Largest per-layer counter in the current ``self.cache_type``; 1 when no layer has stepped yet."""
        return max(self.__step_counter[self.cache_type].values(), default=1)

    def __repr__(self):
        return "USE dLLMCache"
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant.<|eot_id|>\n{% endif %}<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{% if message['role'] == 'assistant' %}{% generation %}{{ message['content'][0]['text'] }}<|eot_id|>{% endgeneration %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}<img><IMG_CONTEXT></img>{% elif content['type'] == 'video' or 'video' in content %}<video><VIDEO_CONTEXT></video>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|eot_id|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>\n{% endif %}"
3
+ }
chat_template_utils.py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import json
17
+ import re
18
+ import types
19
+ from contextlib import contextmanager
20
+ from datetime import datetime
21
+ from functools import lru_cache
22
+ from inspect import isfunction
23
+ from typing import Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
24
+
25
+ from packaging import version
26
+
27
+ from transformers.utils import logging
28
+ from transformers.utils.import_utils import is_jinja_available, is_torch_available, is_vision_available
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+ if is_jinja_available():
34
+ import jinja2
35
+ from jinja2.ext import Extension
36
+ from jinja2.sandbox import ImmutableSandboxedEnvironment
37
+ else:
38
+ jinja2 = None
39
+
40
+ if is_vision_available():
41
+ from PIL.Image import Image
42
+
43
+ if is_torch_available():
44
+ from torch import Tensor
45
+
46
+
47
# Primitive type hints, plus `Any`, `NoneType` and the `...` placeholder.
# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
BASIC_TYPES = (int, float, str, bool, Any, type(None), ...)
# Extracts the initial segment of the docstring, containing the function description
description_re = re.compile(r"^(.*?)[\n\s]*(Args:|Returns:|Raises:|\Z)", re.DOTALL)
# Extracts the Args: block from the docstring
args_re = re.compile(r"\n\s*Args:\n\s*(.*?)[\n\s]*(Returns:|Raises:|\Z)", re.DOTALL)
# Splits the Args: block into individual arguments
# (compiled with re.VERBOSE, so the whitespace and `#` comments inside the pattern are ignored)
args_split_re = re.compile(
    r"""
(?:^|\n) # Match the start of the args block, or a newline
\s*(\w+):\s* # Capture the argument name and strip spacing
(.*?)\s* # Capture the argument description, which can span multiple lines, and strip trailing spacing
(?=\n\s*\w+:|\Z) # Stop when you hit the next argument or the end of the block
""",
    re.DOTALL | re.VERBOSE,
)
# Extracts the Returns: block from the docstring, if present. Note that most chat templates ignore the return type/doc!
returns_re = re.compile(r"\n\s*Returns:\n\s*(.*?)[\n\s]*(Raises:|\Z)", re.DOTALL)
64
+
65
+
66
class TypeHintParsingException(Exception):
    """Raised when a type hint cannot be converted into a JSON schema fragment."""


class DocstringParsingException(Exception):
    """Raised when a docstring is missing or lacks what the JSON schema needs."""
76
+
77
+
78
def _get_json_schema_type(param_type: str) -> dict[str, str]:
    """Map a non-generic type object to its JSON schema fragment.

    Despite the ``str`` annotation, the lookup keys are type objects (``int``,
    ``float``, ...). PIL images and torch tensors are recognised only when the
    corresponding library is available. Anything unknown maps to
    ``{"type": "object"}``.

    A fresh dict is built on every call, so callers may mutate the result.
    """
    known_schemas = {}
    known_schemas[int] = {"type": "integer"}
    known_schemas[float] = {"type": "number"}
    known_schemas[str] = {"type": "string"}
    known_schemas[bool] = {"type": "boolean"}
    known_schemas[type(None)] = {"type": "null"}
    known_schemas[Any] = {}

    if is_vision_available():
        known_schemas[Image] = {"type": "image"}
    if is_torch_available():
        known_schemas[Tensor] = {"type": "audio"}

    if param_type in known_schemas:
        return known_schemas[param_type]
    return {"type": "object"}
92
+
93
+
94
def _parse_type_hint(hint: Any) -> dict:
    """Recursively convert a type hint into a JSON schema fragment.

    Args:
        hint: A type object or a (possibly nested) generic alias such as
            ``list[int]``, ``Optional[str]`` or ``dict[str, float]``.

    Returns:
        The schema fragment describing ``hint``.

    Raises:
        TypeHintParsingException: for single-element tuples, tuples containing
            ``...``, and generic origins other than Union/list/tuple/dict.
    """
    origin = get_origin(hint)
    args = get_args(hint)

    if origin is None:
        # Non-generic hint. The helper falls back to {"type": "object"} for unknown
        # types instead of raising, so no KeyError handling is needed here.
        return _get_json_schema_type(hint)

    elif origin is Union or (hasattr(types, "UnionType") and origin is types.UnionType):
        # Recurse into each of the subtypes in the Union, except None, which is handled separately at the end
        subtypes = [_parse_type_hint(t) for t in args if t is not type(None)]
        if len(subtypes) == 1:
            # A single non-null type can be expressed directly
            return_dict = subtypes[0]
        elif all(isinstance(subtype.get("type"), str) for subtype in subtypes):
            # A union of basic types can be expressed as a list in the schema.
            # `.get` is used because a member such as `Any` yields a schema without a "type" key;
            # indexing it used to raise a bare KeyError instead of falling through to "anyOf".
            return_dict = {"type": sorted([subtype["type"] for subtype in subtypes])}
        else:
            # A union of more complex types requires "anyOf"
            return_dict = {"anyOf": subtypes}
        if type(None) in args:
            return_dict["nullable"] = True
        return return_dict

    elif origin is list:
        if not args:
            return {"type": "array"}
        else:
            # Lists can only have a single type argument, so recurse into it
            return {"type": "array", "items": _parse_type_hint(args[0])}

    elif origin is tuple:
        if not args:
            return {"type": "array"}
        if len(args) == 1:
            raise TypeHintParsingException(
                f"The type hint {str(hint).replace('typing.', '')} is a Tuple with a single element, which "
                "we do not automatically convert to JSON schema as it is rarely necessary. If this input can contain "
                "more than one element, we recommend "
                "using a List[] type instead, or if it really is a single element, remove the Tuple[] wrapper and just "
                "pass the element directly."
            )
        if ... in args:
            raise TypeHintParsingException(
                "Conversion of '...' is not supported in Tuple type hints. "
                "Use List[] types for variable-length"
                " inputs instead."
            )
        return {"type": "array", "prefixItems": [_parse_type_hint(t) for t in args]}

    elif origin is dict:
        # The JSON equivalent to a dict is 'object', which mandates that all keys are strings
        # However, we can specify the type of the dict values with "additionalProperties"
        out = {"type": "object"}
        if len(args) == 2:
            out["additionalProperties"] = _parse_type_hint(args[1])
        return out

    raise TypeHintParsingException("Couldn't parse this type hint, likely due to a custom class or object: ", hint)
157
+
158
+
159
def _convert_type_hints_to_json_schema(func: Callable) -> dict:
    """Build the ``{"type": "object", ...}`` parameter schema for ``func``.

    Every parameter must carry an annotation; parameters without a default are
    listed under ``"required"``. The resolved hints, including a ``"return"``
    entry when the function annotates its return value, become ``"properties"``.

    Raises:
        TypeHintParsingException: if any parameter has no type hint.
    """
    resolved_hints = get_type_hints(func)
    missing = inspect.Parameter.empty

    mandatory = []
    for name, parameter in inspect.signature(func).parameters.items():
        if parameter.annotation == missing:
            raise TypeHintParsingException(f"Argument {parameter.name} is missing a type hint in function {func.__name__}")
        if parameter.default == missing:
            mandatory.append(name)

    schema = {
        "type": "object",
        "properties": {name: _parse_type_hint(hint) for name, hint in resolved_hints.items()},
    }
    if mandatory:
        schema["required"] = mandatory
    return schema
178
+
179
+
180
def parse_google_format_docstring(docstring: str) -> tuple[Optional[str], Optional[dict], Optional[str]]:
    """
    Split a Google-style docstring into its description, its per-argument
    descriptions and its return description.

    Args:
        docstring (str): The docstring to parse.

    Returns:
        A 3-tuple ``(description, args, returns)``. ``description`` and ``returns``
        are `None` when the matching section is not found; ``args`` is a dict that
        is empty when there is no ``Args:`` block.
    """

    def _section(pattern):
        # First capture group of the pattern, stripped, or None when it does not match
        found = pattern.search(docstring)
        if not found:
            return None
        return found.group(1).strip()

    description = _section(description_re)
    args_block = _section(args_re)
    returns = _section(returns_re)

    args_dict = {}
    if args_block is not None:
        # Drop blank lines before splitting the block into individual arguments
        compact_block = "\n".join(filter(str.strip, args_block.split("\n")))
        for arg_name, arg_text in args_split_re.findall(compact_block):
            # Collapse multi-line descriptions into a single line
            args_dict[arg_name] = re.sub(r"\s*\n+\s*", " ", arg_text.strip())

    return description, args_dict, returns
211
+
212
+
213
def get_json_schema(func: Callable) -> dict:
    """
    This function generates a JSON schema for a given function, based on its docstring and type hints. This is
    mostly used for passing lists of tools to a chat template. The JSON schema contains the name and description of
    the function, as well as the names, types and descriptions for each of its arguments. `get_json_schema()` requires
    that the function has a docstring, and that each argument has a description in the docstring, in the standard
    Google docstring format shown below. It also requires that all the function arguments have a valid Python type hint.

    Although it is not required, a `Returns` block can also be added, which will be included in the schema. This is
    optional because most chat templates ignore the return value of the function.

    Args:
        func: The function to generate a JSON schema for.

    Returns:
        A dictionary of the form `{"type": "function", "function": {...}}`, where the inner dictionary holds the
        function's `name`, `description`, `parameters` and, when the function annotates its return value, `return`.

    Raises:
        DocstringParsingException: If the function has no docstring, or the docstring does not describe one of
            the arguments.
        TypeHintParsingException: If an argument is missing a type hint or a hint cannot be converted.

    Examples:
    ```python
    >>> def multiply(x: float, y: float):
    >>>    '''
    >>>    A function that multiplies two numbers
    >>>
    >>>    Args:
    >>>        x: The first number to multiply
    >>>        y: The second number to multiply
    >>>    '''
    >>>    return x * y
    >>>
    >>> print(get_json_schema(multiply))
    {
        "type": "function",
        "function": {
            "name": "multiply",
            "description": "A function that multiplies two numbers",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {"type": "number", "description": "The first number to multiply"},
                    "y": {"type": "number", "description": "The second number to multiply"}
                },
                "required": ["x", "y"]
            }
        }
    }
    ```

    The general use for these schemas is that they are used to generate tool descriptions for chat templates that
    support them, like so:

    ```python
    >>> from transformers import AutoTokenizer
    >>> from transformers.utils import get_json_schema
    >>>
    >>> def multiply(x: float, y: float):
    >>>    '''
    >>>    A function that multiplies two numbers
    >>>
    >>>    Args:
    >>>        x: The first number to multiply
    >>>        y: The second number to multiply
    >>>    '''
    >>>    return x * y
    >>>
    >>> multiply_schema = get_json_schema(multiply)
    >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
    >>> messages = [{"role": "user", "content": "What is 179 x 4571?"}]
    >>> formatted_chat = tokenizer.apply_chat_template(
    >>>     messages,
    >>>     tools=[multiply_schema],
    >>>     chat_template="tool_use",
    >>>     return_dict=True,
    >>>     return_tensors="pt",
    >>>     add_generation_prompt=True
    >>> )
    >>> # The formatted chat can now be passed to model.generate()
    ```

    Each argument description can also have an optional `(choices: ...)` block at the end, such as
    `(choices: ["tea", "coffee"])`, which will be parsed into an `enum` field in the schema. The block must be a
    valid JSON list, and it will only be parsed correctly if it is at the end of the description:

    ```python
    >>> def drink_beverage(beverage: str):
    >>>    '''
    >>>    A function that drinks a beverage
    >>>
    >>>    Args:
    >>>        beverage: The beverage to drink (choices: ["tea", "coffee"])
    >>>    '''
    >>>    pass
    >>>
    >>> print(get_json_schema(drink_beverage))
    ```
    {
        'type': 'function',
        'function': {
            'name': 'drink_beverage',
            'description': 'A function that drinks a beverage',
            'parameters': {
                'type': 'object',
                'properties': {
                    'beverage': {
                        'type': 'string',
                        'enum': ['tea', 'coffee'],
                        'description': 'The beverage to drink'
                    }
                },
                'required': ['beverage']
            }
        }
    }
    """
    doc = inspect.getdoc(func)
    if not doc:
        raise DocstringParsingException(
            f"Cannot generate JSON schema for {func.__name__} because it has no docstring!"
        )
    doc = doc.strip()
    main_doc, param_descriptions, return_doc = parse_google_format_docstring(doc)

    json_schema = _convert_type_hints_to_json_schema(func)
    # The return annotation is parsed together with the arguments; move it out of "properties"
    if (return_dict := json_schema["properties"].pop("return", None)) is not None:
        if return_doc is not None:  # We allow a missing return docstring since most templates ignore it
            return_dict["description"] = return_doc
    for arg, schema in json_schema["properties"].items():
        if arg not in param_descriptions:
            raise DocstringParsingException(
                f"Cannot generate JSON schema for {func.__name__} because the docstring has no description for the argument '{arg}'"
            )
        desc = param_descriptions[arg]
        enum_choices = re.search(r"\(choices:\s*(.*?)\)\s*$", desc, flags=re.IGNORECASE)
        if enum_choices:
            # Only string choices are stripped; numeric or boolean choices such as
            # `(choices: [1, 2, 3])` have no `.strip()` and used to crash here.
            schema["enum"] = [c.strip() if isinstance(c, str) else c for c in json.loads(enum_choices.group(1))]
            desc = enum_choices.string[: enum_choices.start()].strip()
        schema["description"] = desc

    output = {"name": func.__name__, "description": main_doc, "parameters": json_schema}
    if return_dict is not None:
        output["return"] = return_dict
    return {"type": "function", "function": output}
348
+
349
+
350
def _render_with_assistant_indices(
    compiled_template, messages, tools, documents, add_generation_prompt, **template_kwargs
):
    """Render a chat while the template environment's tracker is active.

    Returns:
        ``(rendered_chat, generation_indices)``: the concatenated template output
        and the list that was handed to ``activate_tracker`` for it to fill.
    """
    pieces = []
    spans = []
    tracker = compiled_template.environment.activate_tracker(pieces, spans)
    with tracker:
        stream = compiled_template.generate(
            messages=messages,
            tools=tools,
            documents=documents,
            add_generation_prompt=add_generation_prompt,
            **template_kwargs,
        )
        # Append piece by piece: the tracker receives this same list and may read it mid-render
        for piece in stream:
            pieces.append(piece)
        full_text = "".join(pieces)
    return full_text, spans
366
+
367
+
368
@lru_cache
def _compile_jinja_template(chat_template):
    """Compile a chat template string into a sandboxed Jinja template.

    Results are memoised with ``lru_cache``, so the same template string is
    compiled once. The environment adds the ``{% generation %}`` tag, loop
    controls, a ``tojson`` filter that does not HTML-escape, and the globals
    ``raise_exception`` and ``strftime_now``.

    Raises:
        ImportError: if jinja2 is not installed or is older than 3.1.0.
    """
    if not is_jinja_available():
        raise ImportError(
            "apply_chat_template requires jinja2 to be installed. Please install it using `pip install jinja2`."
        )

    class AssistantTracker(Extension):
        # This extension is used to track the indices of assistant-generated tokens in the rendered chat
        tags = {"generation"}

        def __init__(self, environment: ImmutableSandboxedEnvironment):
            # The class is only initiated by jinja.
            super().__init__(environment)
            environment.extend(activate_tracker=self.activate_tracker)
            self._rendered_blocks = None
            self._generation_indices = None

        def parse(self, parser: jinja2.parser.Parser) -> jinja2.nodes.CallBlock:
            lineno = next(parser.stream).lineno
            body = parser.parse_statements(["name:endgeneration"], drop_needle=True)
            return jinja2.nodes.CallBlock(self.call_method("_generation_support"), [], [], body).set_lineno(lineno)

        @jinja2.pass_eval_context
        def _generation_support(self, context: jinja2.nodes.EvalContext, caller: jinja2.runtime.Macro) -> str:
            rv = caller()
            if self.is_active():
                # Only track generation indices if the tracker is active
                start_index = len("".join(self._rendered_blocks))
                end_index = start_index + len(rv)
                self._generation_indices.append((start_index, end_index))
            return rv

        def is_active(self) -> bool:
            # Compare against None rather than relying on truthiness: the tracker is activated with
            # two *empty* lists, which are falsy. The old truthiness test reported "inactive" until
            # something had been rendered, so a generation block at the very start of the output was
            # not recorded and the reuse guard below could not fire.
            return self._rendered_blocks is not None or self._generation_indices is not None

        @contextmanager
        def activate_tracker(self, rendered_blocks: list[str], generation_indices: list[tuple[int, int]]):
            # Reject re-entry before the try block, so that a refused nested activation does not
            # reset the state of the activation that is still in progress.
            if self.is_active():
                raise ValueError("AssistantTracker should not be reused before closed")
            self._rendered_blocks = rendered_blocks
            self._generation_indices = generation_indices
            try:
                yield
            finally:
                self._rendered_blocks = None
                self._generation_indices = None

    if version.parse(jinja2.__version__) < version.parse("3.1.0"):
        raise ImportError(
            f"apply_chat_template requires jinja2>=3.1.0 to be installed. Your version is {jinja2.__version__}."
        )

    def raise_exception(message):
        raise jinja2.exceptions.TemplateError(message)

    def tojson(x, ensure_ascii=False, indent=None, separators=None, sort_keys=False):
        # We override the built-in tojson filter because Jinja's default filter escapes HTML characters
        # We also expose some options like custom indents and separators
        return json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys)

    def strftime_now(format):
        return datetime.now().strftime(format)

    jinja_env = ImmutableSandboxedEnvironment(
        trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols]
    )
    jinja_env.filters["tojson"] = tojson
    jinja_env.globals["raise_exception"] = raise_exception
    jinja_env.globals["strftime_now"] = strftime_now
    return jinja_env.from_string(chat_template)
440
+
441
+
442
def render_jinja_template(
    conversations: list[list[dict[str, str]]],
    tools: Optional[list[Union[dict, Callable]]] = None,
    documents: Optional[list[dict[str, str]]] = None,
    chat_template: Optional[str] = None,
    return_assistant_tokens_mask: Optional[bool] = False,
    continue_final_message: Optional[bool] = False,
    add_generation_prompt: Optional[bool] = False,
    **kwargs,
) -> tuple[list[str], list[list[tuple[int, int]]]]:
    """Render each conversation with the given chat template.

    Args:
        conversations: The chats to render. Each item is a list of messages, or an object with a
            `messages` attribute.
        tools: JSON schemas and/or plain functions. Functions are converted with `get_json_schema`.
        documents: Dicts passed through to the template as `documents`.
        chat_template: The Jinja template source.
        return_assistant_tokens_mask: When true, also collect the character spans produced inside
            `{% generation %}` blocks.
        continue_final_message: When true, cut the rendered chat right after the text of the final
            message, dropping whatever the template emitted after it.
        add_generation_prompt: Forwarded to the template.
        **kwargs: Extra variables forwarded to the template.

    Returns:
        `(rendered, all_generation_indices)`: one rendered string per conversation, and one list of
        `(start, end)` spans per conversation. The second list stays empty unless
        `return_assistant_tokens_mask` is set.

    Raises:
        ValueError: If a tool is neither a dict nor a function, or if `continue_final_message` is set
            and the final message has no text or does not appear in the rendered chat.
        TypeError: If an entry of `documents` is not a dict.
    """
    if return_assistant_tokens_mask and not re.search(r"\{\%-?\s*generation\s*-?\%\}", chat_template):
        logger.warning_once(
            "return_assistant_tokens_mask==True but chat template does not contain `{% generation %}` keyword."
        )

    # Compilation function uses a cache to avoid recompiling the same template
    compiled_template = _compile_jinja_template(chat_template)

    # We accept either JSON schemas or functions for tools. If we get functions, we convert them to schemas
    if tools is not None:
        tool_schemas = []
        for tool in tools:
            if isinstance(tool, dict):
                tool_schemas.append(tool)
            elif isfunction(tool):
                tool_schemas.append(get_json_schema(tool))
            else:
                raise ValueError(
                    "Tools should either be a JSON schema, or a callable function with type hints "
                    "and a docstring suitable for auto-conversion to a schema."
                )
    else:
        tool_schemas = None

    if documents is not None:
        for document in documents:
            if not isinstance(document, dict):
                raise TypeError("Documents should be a list of dicts with 'title' and 'text' keys!")

    rendered = []
    all_generation_indices = []
    for chat in conversations:
        if hasattr(chat, "messages"):
            # Indicates it's a Conversation object
            chat = chat.messages
        if return_assistant_tokens_mask:
            rendered_chat, generation_indices = _render_with_assistant_indices(
                compiled_template=compiled_template,
                messages=chat,
                tools=tool_schemas,
                documents=documents,
                add_generation_prompt=add_generation_prompt,
                **kwargs,
            )
            all_generation_indices.append(generation_indices)
        else:
            rendered_chat = compiled_template.render(
                messages=chat,
                tools=tool_schemas,
                documents=documents,
                add_generation_prompt=add_generation_prompt,
                **kwargs,
            )
        if continue_final_message:
            final_message = chat[-1]["content"]
            if isinstance(final_message, (list, tuple)):
                for content_block in reversed(final_message):
                    if "text" in content_block:
                        # Pick the last text block in the message (the first one we hit while iterating in reverse)
                        final_message = content_block["text"]
                        break
                else:
                    raise ValueError(
                        "continue_final_message is set but we could not find any text to continue in the final message!"
                    )
            if final_message.strip() not in rendered_chat:
                raise ValueError(
                    "continue_final_message is set but the final message does not appear in the chat after "
                    "applying the chat template! This can happen if the chat template deletes portions of "
                    "the final message. Please verify the chat template and final message in your chat to "
                    "ensure they are compatible."
                )
            # Use the last occurrence, so earlier text that happens to match is left alone
            final_msg_loc = rendered_chat.rindex(final_message.strip())
            if rendered_chat[final_msg_loc : final_msg_loc + len(final_message.lstrip())] == final_message:
                # The template preserves spacing or the message doesn't have trailing spacing, so things are simple
                rendered_chat = rendered_chat[: final_msg_loc + len(final_message.lstrip())]
            else:
                # The message has trailing spacing that was trimmed, so we must be more cautious
                rendered_chat = rendered_chat[: final_msg_loc + len(final_message.strip())]
        rendered.append(rendered_chat)

    return rendered, all_generation_indices
config.json ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PDMLLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_pdmllm.PDMLLMConfig",
7
+ "AutoModel": "modeling_pdmllm.PDMLLM",
8
+ "AutoModelForCausalLM": "modeling_pdmllm.PDMLLM"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "image_size": 512,
12
+ "image_token_id": 126349,
13
+ "kernel_size": [
14
+ 16,
15
+ 16
16
+ ],
17
+ "language_model_config": {
18
+ "_attn_implementation_autoset": true,
19
+ "_name_or_path": "bitersun/LLaDA-8B-Instruct-HF",
20
+ "add_cross_attention": false,
21
+ "architectures": [
22
+ "LLaDAModelLM"
23
+ ],
24
+ "attention_bias": false,
25
+ "attention_dropout": 0.0,
26
+ "auto_map": {
27
+ "AutoConfig": "configuration_llada.LLaDAConfig",
28
+ "AutoModel": "modeling_llada.LLaDAModelLM",
29
+ "AutoModelForCausalLM": "modeling_llada.LLaDAModelLM"
30
+ },
31
+ "bad_words_ids": null,
32
+ "begin_suppress_tokens": null,
33
+ "bos_token_id": 128000,
34
+ "chunk_size_feed_forward": 0,
35
+ "cross_attention_hidden_size": null,
36
+ "decoder_start_token_id": null,
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": false,
39
+ "early_stopping": false,
40
+ "encoder_no_repeat_ngram_size": 0,
41
+ "eos_token_id": 126081,
42
+ "exponential_decay_length_penalty": null,
43
+ "finetuning_task": null,
44
+ "forced_bos_token_id": null,
45
+ "forced_eos_token_id": null,
46
+ "hidden_act": "silu",
47
+ "hidden_size": 4096,
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1"
51
+ },
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 12288,
54
+ "is_decoder": false,
55
+ "is_encoder_decoder": false,
56
+ "label2id": {
57
+ "LABEL_0": 0,
58
+ "LABEL_1": 1
59
+ },
60
+ "length_penalty": 1.0,
61
+ "max_length": 20,
62
+ "max_position_embeddings": 16384,
63
+ "min_length": 0,
64
+ "model_type": "llada",
65
+ "no_repeat_ngram_size": 0,
66
+ "num_attention_heads": 32,
67
+ "num_beam_groups": 1,
68
+ "num_beams": 1,
69
+ "num_hidden_layers": 32,
70
+ "num_key_value_heads": 32,
71
+ "num_return_sequences": 1,
72
+ "output_attentions": false,
73
+ "output_hidden_states": false,
74
+ "output_scores": false,
75
+ "pad_token_id": null,
76
+ "prefix": null,
77
+ "pretraining_tp": 1,
78
+ "problem_type": null,
79
+ "pruned_heads": {},
80
+ "remove_invalid_values": false,
81
+ "repetition_penalty": 1.0,
82
+ "return_dict": true,
83
+ "return_dict_in_generate": false,
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_scaling": null,
86
+ "rope_theta": 500000.0,
87
+ "sep_token_id": null,
88
+ "suppress_tokens": null,
89
+ "task_specific_params": null,
90
+ "temperature": 1.0,
91
+ "tf_legacy_loss": false,
92
+ "tie_encoder_decoder": false,
93
+ "tie_word_embeddings": false,
94
+ "tokenizer_class": null,
95
+ "top_k": 50,
96
+ "top_p": 1.0,
97
+ "torch_dtype": "bfloat16",
98
+ "torchscript": false,
99
+ "typical_p": 1.0,
100
+ "use_bfloat16": false,
101
+ "use_cache": false,
102
+ "vocab_size": 126464
103
+ },
104
+ "mask_patch_embedding_in_channels": 3,
105
+ "mask_patch_embedding_out_channels": 1152,
106
+ "model_type": "pdmllm",
107
+ "num_image_token": 256,
108
+ "patch_size": 16,
109
+ "prompt_numbers": 6,
110
+ "replacement_noise_mode": false,
111
+ "roi_output_size": 4,
112
+ "torch_dtype": "bfloat16",
113
+ "transformers_version": "4.51.3",
114
+ "vision_abstractor_config": {
115
+ "projection_type": "mlp2x_gelu"
116
+ },
117
+ "vision_model_config": {
118
+ "_attn_implementation_autoset": true,
119
+ "_name_or_path": "google/siglip2-so400m-patch16-512",
120
+ "add_cross_attention": false,
121
+ "architectures": null,
122
+ "bad_words_ids": null,
123
+ "begin_suppress_tokens": null,
124
+ "bos_token_id": null,
125
+ "chunk_size_feed_forward": 0,
126
+ "cross_attention_hidden_size": null,
127
+ "decoder_start_token_id": null,
128
+ "diversity_penalty": 0.0,
129
+ "do_sample": false,
130
+ "early_stopping": false,
131
+ "encoder_no_repeat_ngram_size": 0,
132
+ "eos_token_id": null,
133
+ "exponential_decay_length_penalty": null,
134
+ "finetuning_task": null,
135
+ "forced_bos_token_id": null,
136
+ "forced_eos_token_id": null,
137
+ "hidden_size": 1152,
138
+ "id2label": {
139
+ "0": "LABEL_0",
140
+ "1": "LABEL_1"
141
+ },
142
+ "initializer_factor": 1.0,
143
+ "is_decoder": false,
144
+ "is_encoder_decoder": false,
145
+ "label2id": {
146
+ "LABEL_0": 0,
147
+ "LABEL_1": 1
148
+ },
149
+ "length_penalty": 1.0,
150
+ "max_length": 20,
151
+ "min_length": 0,
152
+ "model_type": "siglip",
153
+ "no_repeat_ngram_size": 0,
154
+ "num_beam_groups": 1,
155
+ "num_beams": 1,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_hidden_states": false,
159
+ "output_scores": false,
160
+ "pad_token_id": null,
161
+ "prefix": null,
162
+ "problem_type": null,
163
+ "pruned_heads": {},
164
+ "remove_invalid_values": false,
165
+ "repetition_penalty": 1.0,
166
+ "return_dict": true,
167
+ "return_dict_in_generate": false,
168
+ "sep_token_id": null,
169
+ "suppress_tokens": null,
170
+ "task_specific_params": null,
171
+ "temperature": 1.0,
172
+ "text_config": {
173
+ "_attn_implementation_autoset": false,
174
+ "_name_or_path": "",
175
+ "add_cross_attention": false,
176
+ "architectures": null,
177
+ "attention_dropout": 0.0,
178
+ "bad_words_ids": null,
179
+ "begin_suppress_tokens": null,
180
+ "bos_token_id": 49406,
181
+ "chunk_size_feed_forward": 0,
182
+ "cross_attention_hidden_size": null,
183
+ "decoder_start_token_id": null,
184
+ "diversity_penalty": 0.0,
185
+ "do_sample": false,
186
+ "early_stopping": false,
187
+ "encoder_no_repeat_ngram_size": 0,
188
+ "eos_token_id": 49407,
189
+ "exponential_decay_length_penalty": null,
190
+ "finetuning_task": null,
191
+ "forced_bos_token_id": null,
192
+ "forced_eos_token_id": null,
193
+ "hidden_act": "gelu_pytorch_tanh",
194
+ "hidden_size": 1152,
195
+ "id2label": {
196
+ "0": "LABEL_0",
197
+ "1": "LABEL_1"
198
+ },
199
+ "intermediate_size": 4304,
200
+ "is_decoder": false,
201
+ "is_encoder_decoder": false,
202
+ "label2id": {
203
+ "LABEL_0": 0,
204
+ "LABEL_1": 1
205
+ },
206
+ "layer_norm_eps": 1e-06,
207
+ "length_penalty": 1.0,
208
+ "max_length": 20,
209
+ "max_position_embeddings": 64,
210
+ "min_length": 0,
211
+ "model_type": "siglip_text_model",
212
+ "no_repeat_ngram_size": 0,
213
+ "num_attention_heads": 16,
214
+ "num_beam_groups": 1,
215
+ "num_beams": 1,
216
+ "num_hidden_layers": 27,
217
+ "num_return_sequences": 1,
218
+ "output_attentions": false,
219
+ "output_hidden_states": false,
220
+ "output_scores": false,
221
+ "pad_token_id": 1,
222
+ "prefix": null,
223
+ "problem_type": null,
224
+ "projection_size": 1152,
225
+ "pruned_heads": {},
226
+ "remove_invalid_values": false,
227
+ "repetition_penalty": 1.0,
228
+ "return_dict": true,
229
+ "return_dict_in_generate": false,
230
+ "sep_token_id": null,
231
+ "suppress_tokens": null,
232
+ "task_specific_params": null,
233
+ "temperature": 1.0,
234
+ "tf_legacy_loss": false,
235
+ "tie_encoder_decoder": false,
236
+ "tie_word_embeddings": true,
237
+ "tokenizer_class": null,
238
+ "top_k": 50,
239
+ "top_p": 1.0,
240
+ "torch_dtype": "bfloat16",
241
+ "torchscript": false,
242
+ "typical_p": 1.0,
243
+ "use_bfloat16": false,
244
+ "vocab_size": 256000
245
+ },
246
+ "tf_legacy_loss": false,
247
+ "tie_encoder_decoder": false,
248
+ "tie_word_embeddings": true,
249
+ "tokenizer_class": null,
250
+ "top_k": 50,
251
+ "top_p": 1.0,
252
+ "torch_dtype": "bfloat16",
253
+ "torchscript": false,
254
+ "typical_p": 1.0,
255
+ "use_bfloat16": false,
256
+ "vision_config": {
257
+ "_attn_implementation_autoset": false,
258
+ "_name_or_path": "",
259
+ "add_cross_attention": false,
260
+ "architectures": null,
261
+ "attention_dropout": 0.0,
262
+ "bad_words_ids": null,
263
+ "begin_suppress_tokens": null,
264
+ "bos_token_id": null,
265
+ "chunk_size_feed_forward": 0,
266
+ "cross_attention_hidden_size": null,
267
+ "decoder_start_token_id": null,
268
+ "diversity_penalty": 0.0,
269
+ "do_sample": false,
270
+ "early_stopping": false,
271
+ "encoder_no_repeat_ngram_size": 0,
272
+ "eos_token_id": null,
273
+ "exponential_decay_length_penalty": null,
274
+ "finetuning_task": null,
275
+ "forced_bos_token_id": null,
276
+ "forced_eos_token_id": null,
277
+ "hidden_act": "gelu_pytorch_tanh",
278
+ "hidden_size": 1152,
279
+ "id2label": {
280
+ "0": "LABEL_0",
281
+ "1": "LABEL_1"
282
+ },
283
+ "image_size": 512,
284
+ "intermediate_size": 4304,
285
+ "is_decoder": false,
286
+ "is_encoder_decoder": false,
287
+ "label2id": {
288
+ "LABEL_0": 0,
289
+ "LABEL_1": 1
290
+ },
291
+ "layer_norm_eps": 1e-06,
292
+ "length_penalty": 1.0,
293
+ "max_length": 20,
294
+ "min_length": 0,
295
+ "model_type": "siglip_vision_model",
296
+ "no_repeat_ngram_size": 0,
297
+ "num_attention_heads": 16,
298
+ "num_beam_groups": 1,
299
+ "num_beams": 1,
300
+ "num_channels": 3,
301
+ "num_hidden_layers": 27,
302
+ "num_return_sequences": 1,
303
+ "output_attentions": false,
304
+ "output_hidden_states": false,
305
+ "output_scores": false,
306
+ "pad_token_id": null,
307
+ "patch_size": 16,
308
+ "prefix": null,
309
+ "problem_type": null,
310
+ "pruned_heads": {},
311
+ "remove_invalid_values": false,
312
+ "repetition_penalty": 1.0,
313
+ "return_dict": true,
314
+ "return_dict_in_generate": false,
315
+ "sep_token_id": null,
316
+ "suppress_tokens": null,
317
+ "task_specific_params": null,
318
+ "temperature": 1.0,
319
+ "tf_legacy_loss": false,
320
+ "tie_encoder_decoder": false,
321
+ "tie_word_embeddings": true,
322
+ "tokenizer_class": null,
323
+ "top_k": 50,
324
+ "top_p": 1.0,
325
+ "torch_dtype": "bfloat16",
326
+ "torchscript": false,
327
+ "typical_p": 1.0,
328
+ "use_bfloat16": false
329
+ }
330
+ },
331
+ "vision_output_key": null,
332
+ "vision_select_layer": -2
333
+ }
configuration_llada.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ LLaDA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from transformers.utils import logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ LLaDA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
29
+
30
+
31
class LLaDAConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LLaDAModel`]. It is used to instantiate an LLaDA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaDA-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaDA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LLaDAModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Raises:
        ValueError: If `rope_scaling` is given but is not a two-field dict with a supported `type` and a float
            `factor` greater than 1.
    """

    model_type = "llada"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Runs before super().__init__, so a malformed `rope_scaling` is rejected
        # before the base class processes the remaining kwargs.
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        `None` is accepted as "no scaling". Anything else must be a dict with exactly two entries: `type`
        (one of `"linear"` / `"dynamic"`) and `factor` (a `float` strictly greater than 1).

        Raises:
            ValueError: If `self.rope_scaling` does not satisfy the constraints above.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        # Integer factors are rejected on purpose: the check requires an actual `float`.
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
configuration_pdmllm.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig, AutoConfig, CONFIG_MAPPING
2
+ from transformers.dynamic_module_utils import get_class_from_dynamic_module
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
class PDMLLMConfig(PretrainedConfig):
    """Composite configuration for the PDMLLM multimodal model.

    Holds a language-model config, a vision-model config and the vision
    abstractor settings, plus the image / region-prompt hyperparameters.

    Args:
        language_model_config: A config object, or a plain dict that is
            resolved into a config object (see `_resolve_sub_config`).
        vision_model_config: Same as above, for the vision encoder.
        vision_abstractor_config: Stored unchanged (e.g.
            `{"projection_type": "mlp2x_gelu"}`).
        image_token_id: Stored unchanged.
        image_size: Input image side length used to derive `num_image_token`.
        patch_size: Patch side length used to derive `num_image_token`.
        downsample_ratio: Per-axis ratio applied to the patch grid when
            deriving `num_image_token`.
        vision_select_layer: Stored unchanged.
        replacement_noise_mode: Stored unchanged.
        prompt_numbers: Stored unchanged.
        mask_patch_embedding_in_channels: Stored unchanged.
        mask_patch_embedding_out_channels: Fallback only; used when
            `vision_model_config.vision_config.hidden_size` cannot be read.
        kernel_size: Stored unchanged; `None` means `[16, 16]`.
        roi_output_size: Stored unchanged (see the comment in `__init__`).
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "pdmllm"
    is_composition = True

    def __init__(self,
                 language_model_config=None,
                 vision_model_config=None,
                 vision_abstractor_config=None,
                 image_token_id=None,
                 image_size=512,
                 patch_size=16,
                 downsample_ratio=0.5,
                 vision_select_layer=-2,
                 replacement_noise_mode=False,
                 prompt_numbers=5,
                 mask_patch_embedding_in_channels=3,
                 mask_patch_embedding_out_channels=1152,
                 kernel_size=None,
                 roi_output_size=None,
                 **kwargs):
        super().__init__(**kwargs)
        self.replacement_noise_mode = replacement_noise_mode
        self.image_size = image_size
        self.patch_size = patch_size
        self.downsample_ratio = downsample_ratio
        # Patch-grid area scaled by the squared downsample ratio,
        # e.g. (512 // 16) ** 2 * 0.5 ** 2 == 256.
        self.num_image_token = int((image_size // patch_size) ** 2 * (downsample_ratio ** 2))
        self.vision_select_layer = vision_select_layer
        self.prompt_numbers = prompt_numbers
        self.mask_patch_embedding_in_channels = mask_patch_embedding_in_channels
        # roi_output_size controls how many RoI-aligned tokens replace each crop token.
        # None => keep original (feat_h, feat_w); int => square grid; tuple => (h, w).
        self.roi_output_size = roi_output_size

        self.language_model_config = self._resolve_sub_config(language_model_config)
        self.vision_model_config = self._resolve_sub_config(vision_model_config)

        self.vision_abstractor_config = vision_abstractor_config

        self.image_token_id = image_token_id

        # Prefer the vision tower's own hidden size; fall back to the explicit
        # argument when the vision config is missing or has no `vision_config`.
        try:
            self.mask_patch_embedding_out_channels = self.vision_model_config.vision_config.hidden_size
        except AttributeError:
            self.mask_patch_embedding_out_channels = mask_patch_embedding_out_channels

        # A fresh list per instance: a list literal as the parameter default
        # would be shared (and mutable) across every config object.
        self.kernel_size = [16, 16] if kernel_size is None else kernel_size

    def _resolve_sub_config(self, sub_config):
        """Turn a sub-config dict into a config object; pass anything else through.

        A dict gets `_name_or_path` filled in from this config when absent
        (the dict is updated in place). It is then built with
        `AutoConfig.for_model` when its `model_type` is registered in
        `CONFIG_MAPPING` and its `auto_map["AutoConfig"]` does not contain a
        `.`; otherwise, when `model_type` is non-empty, the class named by
        `auto_map["AutoConfig"]` is loaded as a dynamic module. A dict without
        `model_type` is returned unchanged.
        """
        if not isinstance(sub_config, dict):
            return sub_config

        if '_name_or_path' not in sub_config:
            sub_config['_name_or_path'] = self._name_or_path
        model_type = sub_config.get('model_type', '')
        is_remote_code = '.' in sub_config.get('auto_map', {}).get('AutoConfig', '')
        if model_type in CONFIG_MAPPING and not is_remote_code:
            return AutoConfig.for_model(**sub_config)
        if model_type:
            config_cls = get_class_from_dynamic_module(sub_config["auto_map"]["AutoConfig"],
                                                       sub_config['_name_or_path'])
            return config_cls(**sub_config)
        return sub_config

    @property
    def hidden_size(self):
        """Hidden size of the language model."""
        return self.language_model_config.hidden_size

    def to_dict(self):
        """Serialize the config, always overwriting `auto_map` with the PDMLLM entries."""
        ret_dict = super().to_dict()
        ret_dict["auto_map"] = {
            "AutoConfig": "configuration_pdmllm.PDMLLMConfig",
            "AutoModel": "modeling_pdmllm.PDMLLM",
            "AutoModelForCausalLM": "modeling_pdmllm.PDMLLM"
        }
        return ret_dict

    @classmethod
    def from_dict(cls, config_dict, **kwargs):
        """Build the config, moving a `name_or_path` kwarg into `config_dict['_name_or_path']` first."""
        if 'name_or_path' in kwargs:
            config_dict['_name_or_path'] = kwargs.pop('name_or_path')
        return super().from_dict(config_dict, **kwargs)
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce9307665e98867ed074d88e163135df3dbf752102330c7f8831be63836450d
3
+ size 3950989604
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64331db82bdf3706da1c58ac678d9c76644619753acb1ab575ea548f9656a3a4
3
+ size 3926026584
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd6da345b0f5ad381e446101fe12c166537565a73bc20358e460674bbb6b25e
3
+ size 3926026664
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc9fdfa279c5bf910f9bd5358c79385247b4d1b56411e77bb61cfbedc7f223e3
3
+ size 3926026664
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f573bf881d7b601eda01ab540a7aa57eb0beec140324ee02f0e0445b1d7b4280
3
+ size 2646697936
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_abstractor.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+
7
def build_projection(projection_type: str, in_dim: int, out_dim: int) -> nn.Module:
    """Build an MLP projector from a type string of the form ``mlp<N>x_gelu``.

    The result is an ``nn.Sequential`` that starts with
    ``Linear(in_dim, out_dim)`` and is followed by ``N - 1`` pairs of
    ``GELU`` and ``Linear(out_dim, out_dim)``.

    Raises:
        ValueError: If ``projection_type`` does not match ``mlp<N>x_gelu``.
    """
    parsed = re.match(r'^mlp(\d+)x_gelu$', projection_type)
    if parsed is None:
        raise ValueError(f'Unknown projector type: {projection_type}')

    depth = int(parsed.group(1))
    layers = [nn.Linear(in_dim, out_dim)]
    # Every linear layer after the first is preceded by a GELU.
    for _ in range(depth - 1):
        layers += [nn.GELU(), nn.Linear(out_dim, out_dim)]
    return nn.Sequential(*layers)
19
+
20
+
21
class PerceiverProjection(nn.Module):
    """Thin module wrapper around the projector built by ``build_projection``."""

    def __init__(self, projection_type: str, in_dim: int, out_dim: int):
        super().__init__()
        self.projection = build_projection(projection_type, in_dim, out_dim)

    def forward(self, input_embeds: torch.Tensor):
        """Project ``input_embeds`` and return the result.

        ``requires_grad`` is switched on in place, both on the caller's input
        tensor and on the output.
        NOTE(review): presumably this keeps a gradient path alive when the
        upstream encoder is frozen — confirm against the training code.
        """
        # `requires_grad_` mutates the tensor and returns that same tensor.
        projected = self.projection(input_embeds.requires_grad_(True))
        return projected.requires_grad_(True)
modeling_llada.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_pdmllm.py ADDED
@@ -0,0 +1,1194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+ import re
3
+ import torch
4
+ import torchvision
5
+ import transformers
6
+ from einops import rearrange
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+ from transformers import PreTrainedModel, AutoModel, AutoModelForCausalLM, GenerationConfig
10
+ from transformers import AutoConfig
11
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
12
+ from transformers.feature_extraction_utils import BatchFeature
13
+ from .configuration_pdmllm import PDMLLMConfig
14
+ from .modeling_abstractor import PerceiverProjection
15
+ from .modeling_llada import LLaDAModelLM
16
+ from .cache import *
17
+ from .configuration_llada import LLaDAConfig
18
+
19
def build_vision_model(config, model=None):
    """Return the vision encoder to use.

    A ``model`` supplied by the caller is returned untouched; otherwise one is
    loaded with ``AutoModel.from_pretrained`` from ``config.name_or_path``
    (with ``trust_remote_code=True``). ``config`` must expose ``name_or_path``
    in either case.
    """
    assert hasattr(config, "name_or_path")
    if model is not None:
        return model
    return AutoModel.from_pretrained(
        config.name_or_path,
        config=config,
        trust_remote_code=True,
    )
25
+
26
def vit_forward_with_mask(
    self,
    pixel_values,
    interpolate_pos_encoding: bool = False,
    mask_embeddings=None,
    output_hidden_states: bool = False,
    **kwargs,
):
    """Vision-transformer forward pass with optional additive mask embeddings.

    Written as a free function taking ``self`` so it can be bound onto a
    vision transformer that exposes ``embeddings``, ``encoder.layers``,
    ``post_layernorm``, ``head`` and ``use_head``.

    Args:
        pixel_values: Image batch; must unpack into four dimensions.
        interpolate_pos_encoding: Use ``embeddings.interpolate_pos_encoding``
            instead of the fixed position-embedding table.
        mask_embeddings: Optional tensor added to the flattened patch
            embeddings before the positional term
            (assumed to broadcast against them — TODO confirm shape).
        output_hidden_states: Also return each encoder layer's output.
        **kwargs: Only ``attention_mask`` is read; everything else is ignored.

    Returns:
        ``BaseModelOutputWithPooling`` with ``last_hidden_state`` (after
        ``post_layernorm``), ``pooler_output`` (``None`` unless ``use_head``)
        and ``hidden_states`` (tuple of per-layer outputs, or ``None``).
    """
    attn_mask = kwargs.get("attention_mask")

    _batch, _channels, img_h, img_w = pixel_values.shape
    patch_embed = self.embeddings.patch_embedding
    patches = patch_embed(pixel_values.to(dtype=patch_embed.weight.dtype))  # shape = [*, width, grid, grid]
    tokens = patches.flatten(2).transpose(1, 2)

    if mask_embeddings is not None:
        tokens = tokens + mask_embeddings.to(tokens.device, dtype=tokens.dtype)

    # The positional term goes on after the mask embeddings have been mixed in.
    if interpolate_pos_encoding:
        positional = self.embeddings.interpolate_pos_encoding(tokens, img_h, img_w)
    else:
        positional = self.embeddings.position_embedding(self.embeddings.position_ids)
    tokens = tokens + positional

    keep_layers = bool(output_hidden_states)
    per_layer = []
    for block in self.encoder.layers:
        block_out = block(tokens, attention_mask=attn_mask)
        # Layers may return either a tensor or a tuple whose first item is the tensor.
        tokens = block_out[0] if isinstance(block_out, tuple) else block_out
        if keep_layers:
            per_layer.append(tokens)

    final_states = self.post_layernorm(tokens)
    pooled = self.head(final_states) if self.use_head else None

    return BaseModelOutputWithPooling(
        last_hidden_state=final_states,
        pooler_output=pooled,
        hidden_states=tuple(per_layer) if keep_layers else None,
    )
69
+
70
+
71
+ class PDMLLM(PreTrainedModel):
72
+ config_class = PDMLLMConfig
73
+ supports_gradient_checkpointing = True
74
+ _skip_keys_device_placement = "past_key_values"
75
+ _supports_cache_class = False
76
+ _supports_flash_attn_2 = True
77
+ _supports_sdpa = True
78
+ accepts_loss_kwargs=False
79
+
80
def __init__(self,
             config: PDMLLMConfig,
             language_model=None,
             vision_model=None,
             processor=None,
             ):
    """Build the vision encoder, projector, language model and mask-id embedding.

    Args:
        config: Model configuration; supplies image geometry, the vision and
            language sub-configs and ``prompt_numbers``.
        language_model: Optional pre-built language model. When ``None`` an
            ``LLaDAModelLM`` is created from ``config.language_model_config``.
        vision_model: Optional pre-built vision model, passed to
            ``build_vision_model``.
        processor: Optional processor. When given, its tokenizer is extended
            with the visual-prompt special tokens.

    Raises:
        ValueError: If no language model is supplied and the configured one
            is not an LLaDA model.
    """
    super().__init__(config)
    self.image_size = config.image_size
    self.patch_size = config.patch_size
    self.downsample_ratio = config.downsample_ratio
    self.num_image_token = config.num_image_token
    self.vision_select_layer = config.vision_select_layer
    self.replacement_noise_mode = config.replacement_noise_mode

    # The hidden size may live on the vision config itself or on its nested
    # ``vision_config``; mirror it onto the outer config in the latter case.
    try:
        vision_hidden_states = self.config.vision_model_config.hidden_size
    except AttributeError:
        vision_hidden_states = self.config.vision_model_config.vision_config.hidden_size
        self.config.vision_model_config.hidden_size = vision_hidden_states

    vision_model = build_vision_model(config.vision_model_config, vision_model)

    # The projector input grows by (1 / downsample_ratio) ** 2 because
    # pixel_shuffle folds neighbouring patches into the channel axis.
    vision_abstractor = PerceiverProjection(**config.vision_abstractor_config,
                                            in_dim=self.config.vision_model_config.hidden_size * (int(1 / self.downsample_ratio) ** 2),
                                            out_dim=self.config.language_model_config.hidden_size)

    if language_model is None:
        # NOTE(review): the attention implementation stored on ``config`` is not
        # forwarded to the language model here - confirm LLaDAModelLM reads it
        # from its own config.
        if 'llada' in config.language_model_config.name_or_path.lower():
            with transformers.modeling_utils.no_init_weights():
                language_model = LLaDAModelLM(config.language_model_config)
        else:
            raise ValueError(f"Unsupported language model: {config.language_model_config.name_or_path}")

    self.vision_model = vision_model
    self.vision_abstractor = vision_abstractor
    self.language_model = language_model

    # One learned vector per visual-prompt id, added to the patch embeddings
    # covered by the corresponding region mask (see forward_vision).
    self.mask_id_embedding = nn.Embedding(config.prompt_numbers, config.vision_model_config.vision_config.hidden_size)

    # Rebind the vision tower's forward so it accepts ``mask_embeddings``.
    self.vision_model.vision_model.forward = vit_forward_with_mask.__get__(self.vision_model.vision_model, self.vision_model.vision_model.__class__)

    if processor is not None:
        self.processor = processor

    self.prompt_numbers = config.prompt_numbers
    # Optional override for how many RoI-aligned tokens replace a crop token.
    self.roi_output_size = getattr(config, "roi_output_size", None)

    # Only add special tokens when a processor is available (i.e. during training).
    # During inference via from_pretrained, the tokens are already in the saved tokenizer.
    if hasattr(self, "processor"):
        self._add_special_tokens()
    self.gradient_checkpointing_enable()
150
+
151
def _add_special_tokens(self):
    """Add the visual-prompt tokens to the tokenizer and resize the LM embeddings.

    Registers ``<Prompt0>`` .. ``<Prompt{prompt_numbers - 1}>`` plus
    ``<NO_Prompt>`` as special tokens on ``self.processor.tokenizer``, then
    resizes the language model's token embeddings to the new vocabulary size.
    """
    assert hasattr(self, "processor")

    tokenizer = self.processor.tokenizer
    prompt_tokens = [f"<Prompt{idx}>" for idx in range(self.prompt_numbers)] + ["<NO_Prompt>"]
    added_count = tokenizer.add_tokens(prompt_tokens, special_tokens=True)
    self.language_model.resize_token_embeddings(len(tokenizer))
    print(f"Added {added_count} special tokens.")
163
+
164
def forward_vision(self, pixel_values, global_mask_values_list=None, prompt_tokens=None):
    """Encode image tiles with region-mask prompts and project them for the LM.

    Each region mask is turned into a patch-level binary map; the learned
    embedding of its prompt id is added at the active patches. The summed
    mask embeddings are passed to the vision encoder together with the
    pixels, and the selected hidden state is pixel-shuffled (when
    ``downsample_ratio != 1``) and projected by ``vision_abstractor``.

    Args:
        pixel_values: Image tensor indexed as (n, c, h, w), or a
            ``BatchFeature`` holding it under ``"pixel_values"``.
        global_mask_values_list: Sequence of mask tensors (one per region), or
            a ``BatchFeature`` holding it under ``"pixel_values_list"``.
        prompt_tokens: Sequence of strings containing ``<Prompt{id}>``, paired
            position-wise with the masks.

    Returns:
        The projected vision embeddings produced by ``vision_abstractor``.

    Raises:
        ValueError: If masks or prompt tokens are missing, or a prompt token
            does not contain ``<Prompt{id}>``.
    """
    # pixel_values (n, c, h, w)

    # Unwrap BatchFeature if needed
    if isinstance(pixel_values, BatchFeature):
        pixel_values = pixel_values["pixel_values"]

    if isinstance(global_mask_values_list, BatchFeature):
        mask_values_list = global_mask_values_list.get("pixel_values_list", None)
    else:
        mask_values_list = global_mask_values_list

    # The encoder is only ever invoked with mask embeddings, so fail with a
    # clear message instead of a bare assertion further down.
    if mask_values_list is None:
        raise ValueError("forward_vision requires region masks (global_mask_values_list) but none were provided")
    if prompt_tokens is None:
        raise ValueError("forward_vision requires prompt_tokens matching global_mask_values_list")

    device = pixel_values.device
    K = self.config.kernel_size[0]
    h_patches = pixel_values.shape[2] // K
    w_patches = pixel_values.shape[3] // K

    # Precompute mask embeddings so they can be injected before the vision encoder.
    mask_embeds = torch.zeros(
        pixel_values.shape[0],
        self.config.vision_model_config.vision_config.hidden_size,
        h_patches, w_patches,
        dtype=pixel_values.dtype,
        device=device,
    )
    for prompt_token, mask_values in zip(prompt_tokens, mask_values_list):
        match = re.search(r"<Prompt(\d+)>", prompt_token)
        if match is None:
            raise ValueError(f"prompt token {prompt_token!r} does not contain '<Prompt{{id}}>'")
        vp_id = torch.tensor(int(match.group(1)), device=device)
        vp_embed = self.mask_id_embedding(vp_id).to(device)  # (C,)

        # Collapse multi-channel masks to a single channel.
        if mask_values.shape[1] > 1:
            mask_values = mask_values.mean(dim=1, keepdim=True)
        mask_values = mask_values.to(device)
        # Map values from [-1, 1] back to integer 0..255; every pixel that is
        # not exactly 255 counts as part of the region.
        mask_values = torch.round((mask_values + 1.0) / 2.0 * 255.0).long()
        mask_values = torch.clamp(mask_values, min=0, max=255)
        binary_mask = (mask_values != 255).to(pixel_values.dtype)  # (B, 1, H, W)

        active_patches = torch.nn.functional.interpolate(
            binary_mask,
            size=(h_patches, w_patches),
            mode='nearest'
        )  # (B, 1, h_patches, w_patches)

        # Add mask id embedding at the active patches.
        mask_embeds = mask_embeds + vp_embed.view(1, -1, 1, 1) * active_patches

    mask_embeds = mask_embeds.flatten(2).transpose(1, 2)  # (B, num_patches, C)

    vision_outputs = self.vision_model.vision_model(
        pixel_values=pixel_values,
        mask_embeddings=mask_embeds,
        output_hidden_states=True,
    )

    if self.vision_select_layer == -1:
        image_embeddings = vision_outputs.last_hidden_state
    else:
        image_embeddings = vision_outputs.hidden_states[self.vision_select_layer]  # (B, N, C)

    # Keep all tile embeddings - do NOT filter by image_flags.
    # All tiles are real crops from a single image (produced by dynamic_preprocess).
    # Filtering by pixel-sum==0 can incorrectly drop tiles whose normalized
    # pixel values happen to sum to zero, causing shape mismatches with
    # input_ids image tokens and aspect_ratios in downstream _merge / RoI-align.
    vit_embeds = image_embeddings

    if self.downsample_ratio != 1:
        patch_num = self.image_size // self.patch_size
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], patch_num, patch_num, vit_embeds.shape[-1])
        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
        vit_embeds = vit_embeds.flatten(1, 2)

    vit_embeds = self.vision_abstractor(vit_embeds)

    return vit_embeds
244
+
245
def prepare_for_lm(self, input_ids, vision_embeds):
    """Embed ``input_ids`` and write vision embeddings into the image-token slots.

    Positions where ``input_ids == config.image_token_id`` are overwritten, in
    order, with the rows of ``vision_embeds`` (flattened to 2-D). If the
    counts disagree, surplus vision rows are dropped, or the tail rows are
    repeated to fill the gap.

    Args:
        input_ids: Token ids to embed.
        vision_embeds: Vision embeddings whose last dimension is the hidden
            size, or ``None`` for text-only input.

    Returns:
        The input embeddings, with vision rows scattered in when available.
        If the merge fails, the text embeddings are returned with a zero-valued
        term derived from the vision embeddings added, so the vision branch
        still takes part in the autograd graph.
    """
    import warnings

    inputs_embeds = self.get_input_embeddings()(input_ids)
    if vision_embeds is None:
        return inputs_embeds

    try:
        vision_mask = input_ids == self.config.image_token_id
        num_slots = torch.count_nonzero(vision_mask).item()
        flat_embeds = vision_embeds.contiguous().view(-1, vision_embeds.size(-1))
        num_available = flat_embeds.shape[0]

        if num_slots < num_available:
            # More vision rows than image tokens: keep the leading rows.
            flat_embeds = flat_embeds[:num_slots]
        elif num_slots > num_available:
            # Fewer vision rows than image tokens: repeat the trailing rows.
            missing = num_slots - num_available
            flat_embeds = torch.cat([flat_embeds, flat_embeds[-missing:]], dim=0)

        inputs_embeds = torch.masked_scatter(
            inputs_embeds,
            vision_mask.unsqueeze(-1),
            flat_embeds.to(inputs_embeds.dtype),
        )
    except Exception as exc:  # deliberate best-effort: keep going with text only
        warnings.warn(
            f"prepare_for_lm: could not merge vision embeddings ({exc!r}); "
            "continuing with text embeddings only",
            RuntimeWarning,
        )
        inputs_embeds = inputs_embeds + torch.sum(vision_embeds[0, 0, :]) * 0.0

    return inputs_embeds
280
+
281
def _prepare_inputs_for_generation(
    self,
    input_ids,
    pixel_values=None,
    global_mask_values_list=None,
    aspect_ratios=None,
    bboxes=None,
    prompt_tokens=None,
    attention_mask=None,
    position_ids=None,
    tokenizer=None,
):
    """Build LM inputs for generation, replacing crop tokens with RoI features.

    Runs the vision branch, merges vision embeddings into the token
    embeddings, and - when ``aspect_ratios`` and ``bboxes`` are given -
    replaces each run of ``<|reserved_token_i|>`` with RoI-aligned features
    of the matching box.

    Args:
        input_ids: Token ids; only batch size 1 is supported on the replay path.
        pixel_values: Image tiles for ``forward_vision``; ``None`` skips vision.
        global_mask_values_list: Region masks for ``forward_vision``.
        aspect_ratios: Per-sample pair passed to ``_merge`` as ``(ncw, nch)``.
        bboxes: Per-sample mapping from ``str(crop_token_id)`` to
            ``(x1, y1, x2, y2)``; coordinates are multiplied by the feature-map
            size, so they are presumably normalised to [0, 1] - TODO confirm.
        prompt_tokens: Prompt-token strings for ``forward_vision``.
        attention_mask: Optional 2-D mask; rebuilt as all ones when missing or
            when the sequence length changed.
        position_ids: Optional positions; rebuilt as ``arange`` in the same cases.
        tokenizer: Tokenizer used to resolve the reserved-token ids. Falls back
            to ``self.processor.tokenizer`` when ``None``.

    Returns:
        ``(inputs_embeds, attention_mask, position_ids, input_ids,
        reserved_token_spans)`` where ``reserved_token_spans[b]`` lists
        ``(prompt_index, start, end)`` with ``end`` exclusive.

    Raises:
        ValueError: If the replay path is needed and no tokenizer is available.
    """
    vision_embeds = None
    if pixel_values is not None:
        vision_embeds = self.forward_vision(pixel_values, global_mask_values_list=global_mask_values_list, prompt_tokens=prompt_tokens)

    inputs_embeds = self.prepare_for_lm(input_ids, vision_embeds)
    reserved_token_spans: List[List[tuple]] = [[] for _ in range(input_ids.shape[0])]

    length_changed = False
    if vision_embeds is not None and aspect_ratios is not None and bboxes is not None:
        if tokenizer is None and hasattr(self, "processor") and hasattr(self.processor, "tokenizer"):
            tokenizer = self.processor.tokenizer
        if tokenizer is None:
            raise ValueError("A tokenizer is required to locate <|reserved_token_i|> crop tokens")

        crop_tokens = [
            tokenizer.convert_tokens_to_ids(f"<|reserved_token_{pid}|>")
            for pid in range(self.prompt_numbers)
        ]

        patch_num = self.image_size // self.patch_size
        if self.downsample_ratio != 1:
            feat_h = int(patch_num * self.downsample_ratio)
            feat_w = int(patch_num * self.downsample_ratio)
        else:
            feat_h = patch_num
            feat_w = patch_num

        # With more than one tile the first entry is left out of the merged
        # layout (presumably a global thumbnail - TODO confirm).
        if vision_embeds.shape[0] != 1:
            image_features_tiles = rearrange(
                vision_embeds[1:].unsqueeze(0), "b n (h w) c -> b n c h w", h=feat_h, w=feat_w
            )
        else:
            image_features_tiles = rearrange(
                vision_embeds.unsqueeze(0), "b n (h w) c -> b n c h w", h=feat_h, w=feat_w
            )

        new_inputs_embeds = []
        new_input_ids_list = []
        assert inputs_embeds.shape[0] == 1, "Currently only support batch_size=1"

        for batch_idx in range(inputs_embeds.shape[0]):
            curr_inputs_embeds = inputs_embeds[batch_idx]
            curr_input_ids = input_ids[batch_idx]

            # Collect every span on the untouched ids first so the indices are
            # not disturbed by earlier insertions.
            replacements = []
            orig_input_ids = input_ids[batch_idx]
            for cap_idx, crop_token in enumerate(crop_tokens):
                target_mask = orig_input_ids.eq(crop_token)
                if not target_mask.any():
                    continue
                target_indices = target_mask.nonzero().squeeze()
                if target_indices.ndim == 0:
                    head_idx = tail_idx = target_indices.item()
                else:
                    head_idx = target_indices.min().item()
                    tail_idx = target_indices.max().item()
                replacements.append((head_idx, tail_idx, cap_idx, crop_token))
            # Apply replacements in ascending order with running shift to keep spans aligned
            replacements.sort(key=lambda x: x[0])
            running_shift = 0

            if replacements:
                # The merged feature map is identical for every span of this
                # sample, so build it once.
                image_features_recover = self._merge(
                    image_features_tiles,
                    aspect_ratios[batch_idx][0],
                    aspect_ratios[batch_idx][1],
                )
                feat_h, feat_w = image_features_recover.shape[2:]

            for head_idx, tail_idx, cap_idx, crop_token in replacements:
                adj_head = head_idx + running_shift
                adj_tail = tail_idx + running_shift

                x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]

                # NOTE(review): forward() uses a factor of 28 here; the factor
                # determines the spatial_scale handed to roi_align - confirm
                # which value is intended.
                orig_h, orig_w = feat_h * 16 * 2, feat_w * 16 * 2

                roi_orig_x1 = x1 * orig_w
                roi_orig_y1 = y1 * orig_h
                roi_orig_x2 = x2 * orig_w
                roi_orig_y2 = y2 * orig_h

                spatial_scale = feat_w / orig_w
                roi_feat_x1 = roi_orig_x1 * spatial_scale
                roi_feat_y1 = roi_orig_y1 * spatial_scale
                roi_feat_x2 = roi_orig_x2 * spatial_scale
                roi_feat_y2 = roi_orig_y2 * spatial_scale

                roi = torch.tensor(
                    [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
                    dtype=torch.float32,
                    device=image_features_recover.device,
                )

                if self.roi_output_size is None:
                    output_h, output_w = feat_h, feat_w
                elif isinstance(self.roi_output_size, int):
                    output_h = output_w = self.roi_output_size
                else:
                    output_h, output_w = self.roi_output_size

                # NOTE(review): the box is already in feature coordinates and
                # spatial_scale is passed again - confirm this matches training.
                roi_features = torchvision.ops.roi_align(
                    input=image_features_recover.float(),
                    boxes=roi.unsqueeze(0),
                    output_size=(output_h, output_w),
                    spatial_scale=spatial_scale,
                    sampling_ratio=2,
                    aligned=True,
                )

                # (1, C, oh, ow) -> (oh * ow, C). Only the leading RoI axis is
                # dropped so a single-token RoI stays 2-D.
                image_features_replay = (
                    roi_features.permute(0, 2, 3, 1)
                    .flatten(1, 2)
                    .to(image_features_recover.dtype)
                    .squeeze(0)
                )

                curr_inputs_embeds = torch.cat(
                    [
                        curr_inputs_embeds[:adj_head],
                        image_features_replay,
                        curr_inputs_embeds[adj_tail + 1 :],
                    ]
                )
                curr_input_ids = torch.cat(
                    [
                        curr_input_ids[:adj_head],
                        torch.full(
                            (image_features_replay.shape[0],),
                            crop_token,
                            dtype=torch.long,
                            device=curr_input_ids.device,
                        ),
                        curr_input_ids[adj_tail + 1 :],
                    ]
                )
                reserved_token_spans[batch_idx].append(
                    (cap_idx, adj_head, adj_head + image_features_replay.shape[0])
                )

                length_changed = True

                delta = image_features_replay.shape[0] - (tail_idx - head_idx + 1)
                running_shift += delta

            if reserved_token_spans[batch_idx]:
                reserved_token_spans[batch_idx].sort(key=lambda x: x[1])

            new_inputs_embeds.append(curr_inputs_embeds.unsqueeze(0))
            new_input_ids_list.append(curr_input_ids.unsqueeze(0))

        inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
        input_ids = torch.cat(new_input_ids_list, dim=0)

    # Any length change invalidates caller-provided masks and positions.
    if (
        length_changed
        or attention_mask is None
        or attention_mask.shape[1] != inputs_embeds.shape[1]
        or position_ids is None
        or position_ids.shape[1] != inputs_embeds.shape[1]
    ):
        attention_mask = torch.ones(
            inputs_embeds.shape[0],
            inputs_embeds.shape[1],
            dtype=torch.long,
            device=inputs_embeds.device,
        )
        position_ids = (
            torch.arange(
                0,
                inputs_embeds.shape[1],
                dtype=torch.long,
                device=inputs_embeds.device,
            )
            .unsqueeze(0)
            .repeat(inputs_embeds.shape[0], 1)
        )

    return inputs_embeds, attention_mask, position_ids, input_ids, reserved_token_spans
466
+
467
def pixel_shuffle(self, x, scale_factor=0.5):
    """Trade spatial resolution for channel depth on a channels-last tensor.

    Args:
        x: 4-D tensor whose axes are unpacked as (n, w, h, c).
        scale_factor: Spatial scale applied to both middle axes.

    Returns:
        Contiguous tensor of shape
        (n, w * scale_factor, h * scale_factor, c / scale_factor ** 2).
    """
    x = x.contiguous()
    batch, dim_w, dim_h, channels = x.size()
    scaled_h = int(dim_h * scale_factor)
    scaled_w = int(dim_w * scale_factor)

    # Fold the third axis into channels: (n, w, h * s, c / s).
    folded = x.view(batch, dim_w, scaled_h, int(channels / scale_factor))
    # Swap the two middle axes: (n, h * s, w, c / s).
    folded = folded.transpose(1, 2).contiguous()
    # Fold the other axis the same way: (n, h * s, w * s, c / s ** 2).
    folded = folded.view(batch, scaled_h, scaled_w,
                         int(channels / (scale_factor * scale_factor)))
    # Swap back so the axis order matches the input.
    return folded.transpose(1, 2).contiguous()
479
+
480
def _merge(self, tiles: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
    """Stitch a row-major grid of tiles into one feature map.

    Args:
        tiles: Tensor of shape (batch, num_tiles, channels, tile_h, tile_w).
        ncw: Number of tile columns.
        nch: Number of tile rows.

    Returns:
        Tensor of shape (batch, channels, nch * tile_h, ncw * tile_w).
    """
    batch, tile_count, channels, tile_h, tile_w = tiles.size()
    assert tile_count == ncw * nch, f"{ncw * nch} != {tile_count}"

    grid = tiles.view(batch, nch, ncw, channels, tile_h, tile_w)
    # (B, rows, cols, C, h, w) -> (B, C, rows, h, cols, w) so that rows/h and
    # cols/w are adjacent and can be fused by the final view.
    grid = grid.permute(0, 3, 1, 4, 2, 5).contiguous()
    return grid.view(batch, channels, nch * tile_h, ncw * tile_w)
494
+
495
def _build_custom_4d_mask(
    self,
    input_ids: torch.Tensor,
    attention_mask_2d: torch.Tensor,
    tokenizer,
    dtype: torch.dtype,
    reserved_token_spans: Optional[List[List[tuple]]] = None,
) -> Optional[torch.Tensor]:
    """Construct a 4D additive attention mask that isolates each caption block.

    Queries inside the block that starts at ``<|Mask_Cap_i|>`` may attend only to:
    the block itself, the shared prompt (everything before the first
    ``<|reserved_token_0|>`` plus the tokens between the last reserved span
    and ``<|Mask_Cap_0|>``), and the reserved-token positions of prompt ``i``.
    All other valid positions get ``finfo(dtype).min``. Padding columns are
    masked for every query.

    Args:
        input_ids: (B, L)
        attention_mask_2d: (B, L) padding mask
        tokenizer: tokenizer with convert_tokens_to_ids
        dtype: target dtype for the mask (match hidden states)
        reserved_token_spans: optional per-batch list of (idx, start, end) spans that
            replaced <|reserved_token_i|>. End is exclusive.
    Returns:
        mask_4d: (B, 1, L, L) or None if tokenizer is missing
    """
    if tokenizer is None:
        return None

    device = input_ids.device
    batch_size, seq_len = input_ids.shape
    neg_value = torch.finfo(dtype).min

    # Token ids do not depend on the sequence, so resolve them once.
    mask_cap_ids = [
        tokenizer.convert_tokens_to_ids(f"<|Mask_Cap_{i}|>") for i in range(self.prompt_numbers)
    ]
    reserved_token_ids = [
        tokenizer.convert_tokens_to_ids(f"<|reserved_token_{i}|>") for i in range(self.prompt_numbers)
    ]
    first_reserved_id = tokenizer.convert_tokens_to_ids('<|reserved_token_0|>')
    first_cap_id = tokenizer.convert_tokens_to_ids('<|Mask_Cap_0|>')

    def first_position(seq, token_id):
        """Return the first index of ``token_id`` in ``seq``, or None if absent/unknown."""
        if token_id is None or token_id < 0:
            return None
        hits = torch.nonzero(seq == token_id, as_tuple=False).flatten()
        return hits[0].item() if hits.numel() > 0 else None

    mask_4d = torch.zeros((batch_size, 1, seq_len, seq_len), device=device, dtype=dtype)

    for b in range(batch_size):
        seq = input_ids[b]
        valid = attention_mask_2d[b].bool()

        cap_starts = [first_position(seq, cap_id) for cap_id in mask_cap_ids]
        present_starts = [pos for pos in cap_starts if pos is not None]

        if present_starts:
            spans = (
                reserved_token_spans[b]
                if reserved_token_spans is not None and len(reserved_token_spans) > b
                else []
            )

            # Shared prompt: the prefix before the first reserved token, plus
            # whatever sits between the last reserved span and the captions.
            fix_prompt_len = first_position(seq, first_reserved_id) or 0
            # Use the latest recorded reserved span (after sorting) when available
            last_span_end = spans[-1][2] if len(spans) > 0 else fix_prompt_len
            captions_start = first_position(seq, first_cap_id)
            if captions_start is None:
                # <|Mask_Cap_0|> is absent: fall back to the earliest caption block.
                captions_start = min(present_starts)

            shared = torch.zeros(seq_len, dtype=torch.bool, device=device)
            shared[:fix_prompt_len] = True
            shared[last_span_end:captions_start] = True

            for cap_idx, start in enumerate(cap_starts):
                if start is None:
                    continue

                # Determine the end boundary: next mask_cap or last token in the sentence.
                # NOTE: <|eot_id|> is NOT used as boundary because it now serves as
                # padding within each caption block after the caption-padding change.
                later_starts = [pos for pos in cap_starts[cap_idx + 1:] if pos is not None]
                end = min(later_starts) if later_starts else seq_len

                group = torch.zeros(seq_len, dtype=torch.bool, device=device)
                group[start:end] = True
                group &= valid
                if not bool(group.any()):
                    continue

                # Collect reserved token spans for this caption block
                reserved = torch.zeros(seq_len, dtype=torch.bool, device=device)
                for idx, span_start, span_end in spans:
                    if idx == cap_idx:
                        reserved[span_start:min(span_end, seq_len)] = True

                # Fallback to original reserved token id if no recorded span
                if not bool(reserved.any()):
                    reserved_id = reserved_token_ids[cap_idx]
                    if reserved_id is not None and reserved_id >= 0:
                        reserved |= seq == reserved_id

                blocked = valid & ~(group | shared | reserved)
                if not bool(blocked.any()):
                    continue

                rows = torch.nonzero(group, as_tuple=False).flatten()
                cols = torch.nonzero(blocked, as_tuple=False).flatten()
                # Broadcast (R, 1) x (1, C) to mask every blocked key for every query.
                mask_4d[b, 0, rows.unsqueeze(1), cols.unsqueeze(0)] = neg_value

        # Mask out padding for all queries (consistency)
        invalid = torch.nonzero(~valid, as_tuple=False).flatten()
        if invalid.numel() > 0:
            mask_4d[b, 0, :, invalid] = neg_value

    return mask_4d
618
+
619
def forward(self,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.BoolTensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            pixel_values: Optional[torch.Tensor] = None,
            global_mask_values_list: Optional[List[torch.Tensor]] = None,
            aspect_ratios: Optional[List] = None,
            bboxes: Optional[List] = None,
            prompt_tokens: Optional[List] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            labels: Optional[torch.LongTensor] = None,
            return_dict: bool = True,
            **kwargs,
            ):
    """Run the vision branch, replay region features and call the language model.

    Args:
        input_ids: Token ids; only batch size 1 is supported on the replay path.
        attention_mask: Optional 2-D padding mask; rebuilt as all ones when
            missing or when feature replay changed the sequence length.
        position_ids: Optional positions; rebuilt as ``arange`` in the same cases.
        pixel_values: Image tiles for ``forward_vision``; ``None`` skips vision.
        global_mask_values_list: Region masks for ``forward_vision``.
        aspect_ratios: Per-sample pair passed to ``_merge`` as ``(ncw, nch)``.
        bboxes: Per-sample mapping from ``str(crop_token_id)`` to
            ``(x1, y1, x2, y2)``; presumably normalised to [0, 1] - TODO confirm.
        prompt_tokens: Prompt-token strings for ``forward_vision``.
        past_key_values: Forwarded to the language model.
        labels: Optional labels; replayed RoI positions are filled with -100.
        return_dict: Forwarded to the language model.
        **kwargs: Extra arguments for the language model. ``tokenizer`` is
            consumed here and not forwarded.

    Returns:
        Whatever ``self.language_model`` returns.

    Raises:
        ValueError: If feature replay is needed and no tokenizer is available.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Resolve tokenizers up front so the replay path also works when the model
    # has no processor attached (e.g. after from_pretrained).
    tokenizer_for_mask = kwargs.pop("tokenizer", None)
    has_processor_tokenizer = hasattr(self, "processor") and hasattr(self.processor, "tokenizer")
    if tokenizer_for_mask is None and has_processor_tokenizer:
        tokenizer_for_mask = self.processor.tokenizer
    replay_tokenizer = self.processor.tokenizer if has_processor_tokenizer else tokenizer_for_mask

    # ========Get visual embedding========
    if pixel_values is not None:
        vision_embeds = self.forward_vision(pixel_values, global_mask_values_list=global_mask_values_list, prompt_tokens=prompt_tokens)
    else:
        vision_embeds = None

    # ========Prepare inputs for LM========
    inputs_embeds = self.prepare_for_lm(input_ids, vision_embeds)
    p_mask = None
    answer_length = None
    reserved_token_spans = [[] for _ in range(input_ids.shape[0])]

    # ========Feature Replay (from grasp_any_region)========
    if vision_embeds is not None and aspect_ratios is not None and bboxes is not None:
        if replay_tokenizer is None:
            raise ValueError("A tokenizer is required to locate <|reserved_token_i|> crop tokens")

        # Get crop tokens from reserved special tokens
        crop_tokens = [
            replay_tokenizer.convert_tokens_to_ids(
                f"<|reserved_token_{pid}|>"
            )
            for pid in range(self.prompt_numbers)
        ]

        # Reshape vision_embeds to tiles format for feature replay
        # Assuming vision_embeds shape: (num_tiles, num_tokens, hidden_dim)
        # Need to convert to (batch, num_tiles, channels, h, w) format
        patch_num = self.image_size // self.patch_size
        if self.downsample_ratio != 1:
            feat_h = int(patch_num * self.downsample_ratio)
            feat_w = int(patch_num * self.downsample_ratio)
        else:
            feat_h = patch_num
            feat_w = patch_num

        # Reshape vision_embeds: (num_tiles, num_tokens, hidden_dim) -> (1, num_tiles, hidden_dim, h, w)
        # With more than one tile the first entry is left out of the merged
        # layout (presumably a global thumbnail - TODO confirm).
        if vision_embeds.shape[0] != 1:
            image_features_tiles = rearrange(
                vision_embeds[1:].unsqueeze(0), "b n (h w) c -> b n c h w", h=feat_h, w=feat_w
            )
        else:
            image_features_tiles = rearrange(
                vision_embeds.unsqueeze(0), "b n (h w) c -> b n c h w", h=feat_h, w=feat_w
            )

        new_inputs_embeds = []
        new_input_ids_list = []
        new_labels = [] if labels is not None else None
        length_changed = False
        assert inputs_embeds.shape[0] == 1, "Currently only support batch_size=1"

        for batch_idx in range(inputs_embeds.shape[0]):
            curr_inputs_embeds = inputs_embeds[batch_idx]
            curr_input_ids = input_ids[batch_idx]
            curr_labels = labels[batch_idx] if labels is not None else None
            # Collect all replacements first to avoid index shifting during insertion
            orig_input_ids = input_ids[batch_idx]
            replacements = []
            for cap_idx, crop_token in enumerate(crop_tokens):
                target_mask = orig_input_ids.eq(crop_token)
                if not target_mask.any():
                    continue
                target_indices = target_mask.nonzero().squeeze()
                if target_indices.ndim == 0:
                    head_idx = tail_idx = target_indices.item()
                else:
                    head_idx = target_indices.min().item()
                    tail_idx = target_indices.max().item()
                replacements.append((head_idx, tail_idx, cap_idx, crop_token))
            # Apply replacements in ascending order with running shift to keep spans aligned
            replacements.sort(key=lambda x: x[0])
            running_shift = 0

            if replacements:
                # Merge tiles back to original spatial layout. The result is the
                # same for every span of this sample, so build it once.
                image_features_recover = self._merge(
                    image_features_tiles,
                    aspect_ratios[batch_idx][0],
                    aspect_ratios[batch_idx][1],
                )
                feat_h, feat_w = image_features_recover.shape[2:]

            for head_idx, tail_idx, cap_idx, crop_token in replacements:
                adj_head = head_idx + running_shift
                adj_tail = tail_idx + running_shift

                # Get bbox coordinates
                x1, y1, x2, y2 = bboxes[batch_idx][str(crop_token)]

                # RoI-Align
                # NOTE(review): _prepare_inputs_for_generation uses 16 * 2 = 32
                # here; the factor determines the spatial_scale handed to
                # roi_align - confirm which value is intended.
                orig_h, orig_w = feat_h * 28, feat_w * 28  # Original image size

                # Origin box
                roi_orig_x1 = x1 * orig_w
                roi_orig_y1 = y1 * orig_h
                roi_orig_x2 = x2 * orig_w
                roi_orig_y2 = y2 * orig_h

                # Feature box
                spatial_scale = feat_w / orig_w
                roi_feat_x1 = roi_orig_x1 * spatial_scale
                roi_feat_y1 = roi_orig_y1 * spatial_scale
                roi_feat_x2 = roi_orig_x2 * spatial_scale
                roi_feat_y2 = roi_orig_y2 * spatial_scale

                roi = torch.tensor(
                    [0, roi_feat_x1, roi_feat_y1, roi_feat_x2, roi_feat_y2],
                    dtype=torch.float32,
                    device=image_features_recover.device,
                )

                # output_size controls how many tokens are inserted (output_h * output_w)
                if self.roi_output_size is None:
                    output_h, output_w = feat_h, feat_w
                elif isinstance(self.roi_output_size, int):
                    output_h = output_w = self.roi_output_size
                else:
                    output_h, output_w = self.roi_output_size

                # NOTE(review): the box is already in feature coordinates and
                # spatial_scale is passed again - confirm this is intended.
                roi_features = torchvision.ops.roi_align(
                    input=image_features_recover.float(),
                    boxes=roi.unsqueeze(0),
                    output_size=(output_h, output_w),
                    spatial_scale=spatial_scale,
                    sampling_ratio=2,
                    aligned=True,
                )

                # (1, C, oh, ow) -> (oh * ow, C). Only the leading RoI axis is
                # dropped so a single-token RoI stays 2-D.
                image_features_replay = (
                    roi_features.permute(0, 2, 3, 1)
                    .flatten(1, 2)
                    .to(image_features_recover.dtype)
                    .squeeze(0)
                )

                # Replace crop token embeddings with RoI features
                curr_inputs_embeds = torch.cat(
                    [
                        curr_inputs_embeds[:adj_head],
                        image_features_replay,
                        curr_inputs_embeds[adj_tail + 1 :],
                    ]
                )
                curr_input_ids = torch.cat(
                    [
                        curr_input_ids[:adj_head],
                        torch.full(
                            (image_features_replay.shape[0],),
                            crop_token,
                            dtype=torch.long,
                            device=input_ids.device,
                        ),
                        curr_input_ids[adj_tail + 1 :],
                    ]
                )
                reserved_token_spans[batch_idx].append(
                    (cap_idx, adj_head, adj_head + image_features_replay.shape[0])
                )

                if curr_labels is not None:
                    # Replayed positions never contribute to the loss.
                    curr_labels = torch.cat(
                        [
                            curr_labels[:adj_head],
                            -100 * torch.ones(
                                image_features_replay.shape[0],
                                dtype=torch.long,
                                device=labels.device,
                            ),
                            curr_labels[adj_tail + 1 :],
                        ]
                    )

                assert (
                    curr_labels is None or curr_inputs_embeds.shape[0] == curr_labels.shape[0]
                ), f"shape mismatch, got {curr_inputs_embeds.shape[0]} != {curr_labels.shape[0]}"

                length_changed = True

                # Track shift caused by this replacement for subsequent insertions
                delta = image_features_replay.shape[0] - (tail_idx - head_idx + 1)
                running_shift += delta

            # Keep spans ordered by start so downstream masking reads consistent positions
            if reserved_token_spans[batch_idx]:
                reserved_token_spans[batch_idx].sort(key=lambda x: x[1])

            new_inputs_embeds.append(curr_inputs_embeds.unsqueeze(0))
            new_input_ids_list.append(curr_input_ids.unsqueeze(0))
            if new_labels is not None:
                new_labels.append(curr_labels)

        inputs_embeds = torch.cat(new_inputs_embeds, dim=0)
        input_ids = torch.cat(new_input_ids_list, dim=0)
        if new_labels is not None:
            # Per-sample labels are 1-D here; they are reshaped to (batch, seq) below.
            labels = torch.cat(new_labels, dim=0)

        # Any length change invalidates caller-provided masks and positions.
        if (
            length_changed
            or attention_mask is None
            or attention_mask.shape[1] != inputs_embeds.shape[1]
            or position_ids is None
            or position_ids.shape[1] != inputs_embeds.shape[1]
        ):
            attention_mask = None
            position_ids = None

    if attention_mask is None:
        attention_mask = torch.ones(
            inputs_embeds.shape[0],
            inputs_embeds.shape[1],
            dtype=torch.long,
            device=inputs_embeds.device,
        )
    if position_ids is None:
        position_ids = (
            torch.arange(
                0,
                inputs_embeds.shape[1],
                dtype=torch.long,
                device=inputs_embeds.device,
            )
            .unsqueeze(0)
            .repeat(inputs_embeds.shape[0], 1)
        )

    custom_mask = self._build_custom_4d_mask(
        input_ids=input_ids,
        attention_mask_2d=attention_mask,
        tokenizer=tokenizer_for_mask,
        dtype=inputs_embeds.dtype,
        reserved_token_spans=reserved_token_spans,
    )
    if custom_mask is not None:
        attention_mask = custom_mask

    if self.is_gradient_checkpointing and torch.is_grad_enabled():
        inputs_embeds.requires_grad_(True)

    # Normalize label shape to (batch, seq_len) to match logits masking in language model
    if labels is not None and labels.dim() == 1:
        expected_tokens = inputs_embeds.shape[0] * inputs_embeds.shape[1]
        if labels.numel() == expected_tokens:
            labels = labels.view(inputs_embeds.shape[0], inputs_embeds.shape[1])

    # ========Forward into LM========
    outputs = self.language_model(
        input_ids=None,
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        return_dict=return_dict,
        labels=labels,
        use_cache=False,
        conversation_ids=None,
        replacement_noise_mode=self.replacement_noise_mode,
        p_mask = p_mask,
        answer_length = answer_length,
        **kwargs,
    )

    return outputs
915
+
916
def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
    """Enable gradient checkpointing on this model and on the language model.

    Also calls ``enable_input_require_grads`` on the language model.
    """
    super().gradient_checkpointing_enable(gradient_checkpointing_kwargs)
    self.language_model.gradient_checkpointing_enable()
    self.language_model.enable_input_require_grads()
920
+
921
def get_input_embeddings(self):
    """Return the language model's input embedding module."""
    return self.language_model.get_input_embeddings()
923
+
924
def set_input_embeddings(self, value):
    """Replace the language model's input embedding module with ``value``."""
    self.language_model.set_input_embeddings(value)
926
+
927
def get_output_embeddings(self):
    """Return the language model's output embedding module."""
    return self.language_model.get_output_embeddings()
929
+
930
def set_output_embeddings(self, new_embeddings):
    """Replace the language model's output embedding module with ``new_embeddings``."""
    self.language_model.set_output_embeddings(new_embeddings)
932
+
933
def set_decoder(self, decoder):
    """Forward ``decoder`` to the language model's ``set_decoder``."""
    self.language_model.set_decoder(decoder)
935
+
936
def get_decoder(self):
    """Return the decoder exposed by the language model."""
    return self.language_model.get_decoder()
938
+
939
def tie_weights(self):
    """Delegate weight tying to the language model and return its result."""
    return self.language_model.tie_weights()
941
+
942
@torch.no_grad()
def generate(
    self,
    pixel_values: Optional[torch.FloatTensor] = None,
    input_ids: Optional[torch.FloatTensor] = None,
    global_mask_values_list: Optional[torch.FloatTensor] = None,
    aspect_ratios: Optional[List] = None,
    bboxes: Optional[List] = None,
    prompt_tokens: Optional[List] = None,
    tokenizer=None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Generate with the language model from merged vision/text embeddings.

    Args:
        pixel_values: Image tiles; forwarded to ``_prepare_inputs_for_generation``.
        input_ids: Prompt token ids.
        global_mask_values_list: Region masks for the vision branch.
        aspect_ratios: Per-sample tile-grid pair used for feature replay.
        bboxes: Per-sample region boxes used for feature replay.
        prompt_tokens: Prompt-token strings paired with the masks.
        tokenizer: Tokenizer for token-id lookups; ``self.processor.tokenizer``
            is used for the attention mask when this is ``None``.
        **generate_kwargs: Forwarded to
            ``language_model.generate_with_embeds_nonblock``.

    Returns:
        The output of ``language_model.generate_with_embeds_nonblock``.

    Raises:
        ValueError: If the configured language model is not an LLaDA model.
    """
    # Fail fast: only the LLaDA generation path exists.
    lm_name = self.config.language_model_config.name_or_path
    if 'llada' not in lm_name.lower():
        raise ValueError(f"Unsupported language model: {lm_name}")

    inputs_embeds, attention_mask, position_ids, input_ids, reserved_token_spans = self._prepare_inputs_for_generation(
        input_ids=input_ids,
        pixel_values=pixel_values,
        global_mask_values_list=global_mask_values_list,
        aspect_ratios=aspect_ratios,
        bboxes=bboxes,
        prompt_tokens=prompt_tokens,
        tokenizer=tokenizer,
    )

    tokenizer_for_mask = tokenizer
    if tokenizer_for_mask is None and hasattr(self, "processor") and hasattr(self.processor, "tokenizer"):
        tokenizer_for_mask = self.processor.tokenizer

    # Prefer the block-structured 4-D mask; keep the 2-D mask when no
    # tokenizer is available to build it.
    custom_mask = self._build_custom_4d_mask(
        input_ids=input_ids,
        attention_mask_2d=attention_mask,
        tokenizer=tokenizer_for_mask,
        dtype=inputs_embeds.dtype,
        reserved_token_spans=reserved_token_spans,
    )
    if custom_mask is not None:
        attention_mask = custom_mask

    return self.language_model.generate_with_embeds_nonblock(
        inputs_embeds=inputs_embeds,
        input_ids=input_ids,
        attention_mask=attention_mask,
        **generate_kwargs,
    )
985
+
986
@torch.no_grad()
def generate_replace_noise(
    self,
    pixel_values: Optional[torch.FloatTensor] = None,
    input_ids: Optional[torch.FloatTensor] = None,
    global_mask_values_list: Optional[torch.FloatTensor] = None,
    aspect_ratios: Optional[List] = None,
    bboxes: Optional[List] = None,
    prompt_tokens: Optional[List] = None,
    tokenizer=None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Prepare multimodal inputs and run the language model's replace-noise generation.

    All named arguments are forwarded to ``_prepare_inputs_for_generation``;
    ``generate_kwargs`` go unchanged to
    ``language_model.generate_with_embeds_replace_noise``.

    Returns:
        The pair ``(outputs, all_steps_response)`` produced by the language
        model call (a tuple, despite the declared return annotation).
    """
    prepared = self._prepare_inputs_for_generation(
        input_ids=input_ids,
        pixel_values=pixel_values,
        global_mask_values_list=global_mask_values_list,
        aspect_ratios=aspect_ratios,
        bboxes=bboxes,
        prompt_tokens=prompt_tokens,
        tokenizer=tokenizer,
    )
    embeds, mask, _positions, token_ids, spans = prepared

    # Use the processor's tokenizer for the mask when none was passed in.
    mask_tokenizer = tokenizer
    if mask_tokenizer is None:
        mask_tokenizer = getattr(getattr(self, "processor", None), "tokenizer", None)

    mask_4d = self._build_custom_4d_mask(
        input_ids=token_ids,
        attention_mask_2d=mask,
        tokenizer=mask_tokenizer,
        dtype=embeds.dtype,
        reserved_token_spans=spans,
    )
    # The 2D mask is kept whenever no custom mask is produced.
    effective_mask = mask if mask_4d is None else mask_4d

    result = self.language_model.generate_with_embeds_replace_noise(
        inputs_embeds=embeds,
        input_ids=token_ids,
        attention_mask=effective_mask,
        **generate_kwargs,
    )
    outputs, all_steps_response = result
    return outputs, all_steps_response
1029
+
1030
def get_template(self):
    """Return the chat prompt template for the configured language model.

    Returns:
        A dict with the keys ``SYSTEM``, ``INSTRUCTION``, ``SUFFIX``,
        ``SUFFIX_AS_EOS``, ``SEP`` and ``STOP_WORDS``.

    Raises:
        NotImplementedError: If the language model is not a LLaDA backbone.
            Previously this case failed with ``UnboundLocalError`` because no
            template was ever assigned.
    """
    backbone_name = self.config.language_model_config.name_or_path
    if 'llada' not in backbone_name.lower():
        raise NotImplementedError(
            f"No chat template is defined for language model '{backbone_name}'."
        )
    return dict(
        SYSTEM=("<|start_header_id|>system<|end_header_id|>\n{system}<|eot_id|>\n"),
        INSTRUCTION=("<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"),
        SUFFIX="<|eot_id|>",
        SUFFIX_AS_EOS=True,
        SEP="\n",
        STOP_WORDS=["<|eot_id|>"],
    )
1041
+
1042
@torch.no_grad()
def chat(
    self,
    tokenizer,
    pixel_values,
    question,
    generation_config,
    global_mask_values=None,
    aspect_ratios=None,
    bboxes=None,
    history=None,
    return_history=False,
    num_patches_list=None,
    IMG_START_TOKEN='<img>',
    IMG_END_TOKEN='</img>',
    IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
    verbose=False
):
    """Run one chat turn: build the prompt, generate, and decode the answer.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        pixel_values: Image tensor whose first dimension is the number of
            image patches, or ``None`` for a text-only turn.
        question: User message. ``'<image>\\n'`` is prepended on the first
            turn when an image is given and no ``<image>`` placeholder exists.
        generation_config: Mapping of keyword arguments for ``self.generate``.
            It is copied; the caller's mapping is left untouched.
        global_mask_values: Region masks, forwarded to ``self.generate`` as
            ``global_mask_values_list``.
        aspect_ratios: Forwarded to ``self.generate``.
        bboxes: Forwarded to ``self.generate``.
        history: Text of the conversation so far, or ``None``.
        return_history: When true, return ``(response, history)``.
        num_patches_list: Patch count per image; defaults to a single image
            covering all of ``pixel_values``.
        IMG_START_TOKEN: Token placed before each image's context tokens.
        IMG_END_TOKEN: Token placed after each image's context tokens.
        IMG_CONTEXT_TOKEN: Placeholder token repeated once per image feature.
        verbose: Print the ViT batch size and, when history is not returned,
            the response.

    Returns:
        The decoded response, or ``(response, history)`` when
        ``return_history`` is true. The returned history is the full prompt
        (which already starts with the incoming history) plus the raw reply.
    """
    if history is None and pixel_values is not None and '<image>' not in question:
        question = '<image>\n' + question

    if num_patches_list is None:
        num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
    assert pixel_values is None or len(pixel_values) == sum(num_patches_list)

    self.img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)

    template = self.get_template()
    suffix = template["SUFFIX"].strip()
    eos_token_id = tokenizer.convert_tokens_to_ids(suffix)

    prior_history = "" if history is None else history
    prompt = prior_history + template["INSTRUCTION"].format(input=question)

    if verbose and pixel_values is not None:
        image_bs = pixel_values.shape[0]
        print(f'dynamic ViT batch size: {image_bs}')

    # Work on the reversed prompt so each replace() hits the LAST remaining
    # '<image>' placeholder; images are therefore matched from the end.
    prompt = prompt[::-1]
    for num_patches in num_patches_list[::-1]:
        image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
        prompt = prompt.replace('<image>'[::-1], image_tokens[::-1], 1)
    prompt = prompt[::-1]

    model_inputs = tokenizer(prompt, return_tensors='pt')
    device = torch.device(self.language_model.device if torch.cuda.is_available() else 'cpu')
    input_ids = model_inputs['input_ids'].to(device)

    # Copy so that injecting eos_token_id does not mutate the caller's config.
    gen_kwargs = dict(generation_config)
    gen_kwargs['eos_token_id'] = eos_token_id

    # NOTE(review): `tokenizer` is not forwarded to generate(), which then
    # relies on self.processor.tokenizer for the custom mask - confirm intended.
    generation_output = self.generate(
        pixel_values=pixel_values,
        # generate() names this parameter `global_mask_values_list`; the old
        # keyword leaked into **generate_kwargs instead.
        global_mask_values_list=global_mask_values,
        aspect_ratios=aspect_ratios,
        bboxes=bboxes,
        input_ids=input_ids,
        **gen_kwargs
    )
    # Decode only the tokens that follow the prompt of the first sequence.
    response = [
        tokenizer.decode(g[len(p):].tolist())
        for p, g in zip(input_ids, generation_output)
    ][0]

    # `prompt` already begins with the incoming history, so it must not be
    # prepended a second time.
    updated_history = prompt + response
    response = response.split(suffix)[0].strip()
    if return_history:
        return response, updated_history
    if verbose:
        print(response)
    return response
1114
+
1115
@torch.no_grad()
def chat_replace_noise(
    self,
    tokenizer,
    pixel_values,
    question,
    generation_config,
    global_mask_values=None,
    aspect_ratios=None,
    bboxes=None,
    history=None,
    return_history=False,
    num_patches_list=None,
    IMG_START_TOKEN='<img>',
    IMG_END_TOKEN='</img>',
    IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
    verbose=False

):
    """Run one chat turn through ``generate_replace_noise`` and print every step.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode outputs.
        pixel_values: Image tensor whose first dimension is the number of
            image patches, or ``None`` for a text-only turn.
        question: User message. ``'<image>\\n'`` is prepended on the first
            turn when an image is given and no ``<image>`` placeholder exists.
        generation_config: Mapping of keyword arguments for
            ``self.generate_replace_noise``. It is copied; the caller's
            mapping is left untouched.
        global_mask_values: Region masks, forwarded as
            ``global_mask_values_list``.
        aspect_ratios: Forwarded to ``self.generate_replace_noise``.
        bboxes: Forwarded to ``self.generate_replace_noise``.
        history: Text of the conversation so far, or ``None``.
        return_history: When true, return ``(response, history)``.
        num_patches_list: Patch count per image; defaults to a single image
            covering all of ``pixel_values``.
        IMG_START_TOKEN: Token placed before each image's context tokens.
        IMG_END_TOKEN: Token placed after each image's context tokens.
        IMG_CONTEXT_TOKEN: Placeholder token repeated once per image feature.
        verbose: Print the ViT batch size and, when history is not returned,
            the response.

    Returns:
        The decoded response, or ``(response, history)`` when
        ``return_history`` is true. The decoded intermediate steps are
        printed (regardless of ``verbose``) but not returned.
    """
    if history is None and pixel_values is not None and '<image>' not in question:
        question = '<image>\n' + question

    if num_patches_list is None:
        num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
    assert pixel_values is None or len(pixel_values) == sum(num_patches_list)

    self.img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)

    template = self.get_template()
    suffix = template["SUFFIX"].strip()
    eos_token_id = tokenizer.convert_tokens_to_ids(suffix)

    prior_history = "" if history is None else history
    prompt = prior_history + template["INSTRUCTION"].format(input=question)

    if verbose and pixel_values is not None:
        image_bs = pixel_values.shape[0]
        print(f'dynamic ViT batch size: {image_bs}')

    # Work on the reversed prompt so each replace() hits the LAST remaining
    # '<image>' placeholder; images are therefore matched from the end.
    prompt = prompt[::-1]
    for num_patches in num_patches_list[::-1]:
        image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
        prompt = prompt.replace('<image>'[::-1], image_tokens[::-1], 1)
    prompt = prompt[::-1]

    model_inputs = tokenizer(prompt, return_tensors='pt')
    device = torch.device(self.language_model.device if torch.cuda.is_available() else 'cpu')
    input_ids = model_inputs['input_ids'].to(device)

    # Copy so that injecting eos_token_id does not mutate the caller's config.
    gen_kwargs = dict(generation_config)
    gen_kwargs['eos_token_id'] = eos_token_id

    generation_output, all_steps_response = self.generate_replace_noise(
        pixel_values=pixel_values,
        # generate_replace_noise() names this parameter
        # `global_mask_values_list`; the old keyword leaked into
        # **generate_kwargs instead.
        global_mask_values_list=global_mask_values,
        aspect_ratios=aspect_ratios,
        bboxes=bboxes,
        input_ids=input_ids,
        **gen_kwargs
    )
    response = tokenizer.batch_decode(generation_output, skip_special_tokens=False)[0]

    # Decode the first sequence of every intermediate step and print it.
    decoded_steps = [
        tokenizer.batch_decode(step_response, skip_special_tokens=False)[0]
        for step_response in all_steps_response
    ]
    for i, step_response in enumerate(decoded_steps):
        print(f"Step {i}: {step_response}\n")

    # `prompt` already begins with the incoming history, so it must not be
    # prepended a second time.
    updated_history = prompt + response
    response = response.split(suffix)[0].strip()
    if return_history:
        return response, updated_history
    if verbose:
        print(response)
    return response
1192
+
1193
+ AutoConfig.register("pdmllm", PDMLLMConfig)
1194
+ AutoModel.register(PDMLLMConfig, PDMLLM)
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_pdmllm.PDMLLMProcessor"
4
+ },
5
+ "do_convert_rgb": null,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "SiglipImageProcessor",
15
+ "image_std": [
16
+ 0.5,
17
+ 0.5,
18
+ 0.5
19
+ ],
20
+ "processor_class": "PDMLLMProcessor",
21
+ "resample": 2,
22
+ "rescale_factor": 0.00392156862745098,
23
+ "size": {
24
+ "height": 512,
25
+ "width": 512
26
+ }
27
+ }
processing_pdmllm.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import torch
4
+ import warnings
5
+ import PIL.Image
6
+
7
+ from torch.nn import functional as F
8
+ from collections import UserDict, OrderedDict
9
+ from typing import Union, Optional, Tuple, List, Dict, Any
10
+
11
+ from transformers.image_utils import load_image
12
+ from transformers.feature_extraction_utils import BatchFeature
13
+ from .chat_template_utils import render_jinja_template
14
+ from transformers.processing_utils import ProcessorMixin, AllKwargsForChatTemplate
15
+
16
+
17
class PDMLLMProcessor(ProcessorMixin):
    """Processor that combines a tokenizer and an image processor for PDMLLM.

    It renders chat conversations into a prompt, splits images into square
    tiles, expands each image placeholder token to one token per image
    feature, and returns a single-sample ``BatchFeature``.
    """

    attributes = ["tokenizer", "image_processor"]
    optional_attributes = ['chat_template']
    model_input_names = ['input_ids', 'attention_mask', 'pixel_values']
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
            self, tokenizer, image_processor, chat_template=None,
            image_size=512,
            patch_size=16,
            downsample_ratio=0.5,
            max_sub_img=6,
            min_sub_img=1,
            image_token='<IMG_CONTEXT>',
            image_start_token='<img>',
            image_end_token='</img>',
            special_tokens=['<IMG_CONTEXT>', '<img>', '</img>'],
            **kwargs):
        """Set up token ids, tile limits and the default chat template.

        Args:
            tokenizer: Tokenizer to wrap; the special tokens are added to it.
            image_processor: Image processor used to turn tiles into tensors.
            chat_template: Jinja chat template; a built-in one is used if None.
            image_size: Tile side length, as an int or a (height, width) pair
                whose first entry is used.
            patch_size: Vision patch size used to derive tokens per tile.
            downsample_ratio: Per-axis feature downsampling ratio.
            max_sub_img: Maximum number of tiles for a single image.
            min_sub_img: Minimum number of tiles for a single image.
            image_token: Placeholder token standing for image features.
            image_start_token: Token opening an image span.
            image_end_token: Token closing an image span.
            special_tokens: Tokens registered with the tokenizer. The default
                list is never mutated; a new list is built from it.
            **kwargs: ``vision_token_share_pe`` (default True) and
                ``image_token_len`` (default 256) are read from here.
        """
        if chat_template is None:
            chat_template = "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant.<|eot_id|>\n{% endif %}<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{% if message['role'] == 'assistant' %}{% generation %}{{ message['content'][0]['text'] }}<|eot_id|>{% endgeneration %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}<img><IMG_CONTEXT></img>{% elif content['type'] == 'video' or 'video' in content %}<video><VIDEO_CONTEXT></video>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|eot_id|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>\n{% endif %}"
        super().__init__(tokenizer=tokenizer, image_processor=image_processor, chat_template=chat_template)
        if isinstance(image_size, (list, tuple)):
            image_size = image_size[0]
        # Number of placeholder tokens that one tile expands to.
        self.num_image_token = int((image_size // patch_size) ** 2 * (downsample_ratio ** 2))

        self.vision_token_share_pe = kwargs.get('vision_token_share_pe', True)
        self.image_token_len = kwargs.pop('image_token_len', 256)
        self.max_sub_img = max_sub_img
        self.min_sub_img = min_sub_img

        self.image_token = image_token
        self.image_start_token = image_start_token
        self.image_end_token = image_end_token
        special_tokens = list(special_tokens) + [f'<|Mask_Cap_{i}|>' for i in range(16)]
        self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}, replace_additional_special_tokens=False)
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        self.image_start_token_id = self.tokenizer.convert_tokens_to_ids(self.image_start_token)
        self.image_end_token_id = self.tokenizer.convert_tokens_to_ids(self.image_end_token)
        if 'llada' in tokenizer.name_or_path.lower():
            self._pad_token_id = self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        else:
            # Without this fallback the `pad_token_id` property raised
            # AttributeError for every non-LLaDA tokenizer.
            self._pad_token_id = self.tokenizer.pad_token_id

        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        self.image_size = image_size
        assert image_size[0] == image_size[1]

    def apply_chat_template(self, conversation, chat_template=None, **kwargs) -> Union[Tuple[Any, Any], BatchFeature]:
        """Render a conversation with the chat template and optionally tokenize it.

        Args:
            conversation: List of message dicts with ``role`` and ``content``.
            chat_template: Template override; ``self.chat_template`` if None.
            **kwargs: ``tokenize``, ``return_dict``, ``return_tensors``,
                ``images``, ``videos``, ``padding``, ``truncation`` and
                ``max_length`` are consumed here; everything else is passed
                to the template renderer.

        Returns:
            ``(prompt, generation_indices)`` when ``tokenize`` is false,
            otherwise the ``BatchFeature`` produced by ``__call__``.
        """
        if chat_template is None:
            chat_template = self.chat_template

        # Split template kwargs from processor/tokenization kwargs so that
        # `tokenize=True` can reuse the processor pipeline without polluting
        # the template rendering inputs.
        tokenize = kwargs.pop("tokenize", False)
        kwargs.pop("return_dict", False)  # accepted for API parity, unused
        return_tensors = kwargs.pop("return_tensors", None)
        # Copy so a caller-supplied list is never mutated; tolerate None.
        images = list(kwargs.pop("images", None) or [])
        videos = kwargs.pop("videos", None)

        if not images:
            # Collect images referenced inside the messages. `image_url`
            # items are included because the default template emits an image
            # placeholder for them as well.
            for message in conversation:
                content = message.get("content", [])
                if not isinstance(content, list):
                    continue
                for item in content:
                    if not isinstance(item, dict):
                        continue
                    if item.get("type") == "image" or "image" in item or "image_url" in item:
                        image = item.get("image") or item.get("image_url")
                        if image is not None:
                            images.append(image)

        processor_kwargs = {}
        for key in ("padding", "truncation", "max_length"):
            if key in kwargs:
                processor_kwargs[key] = kwargs.pop(key)
        if return_tensors is not None:
            processor_kwargs["return_tensors"] = return_tensors

        # Whatever is left in kwargs is custom input for the template.
        template_kwargs = dict(kwargs)

        prompt, generation_indices = render_jinja_template(
            conversations=[conversation],
            chat_template=chat_template,
            return_assistant_tokens_mask=True,
            **template_kwargs,  # different flags such as `return_assistant_mask`
            **self.tokenizer.special_tokens_map,  # tokenizer special tokens are used by some templates
        )

        if not tokenize:
            return prompt, generation_indices

        # Reuse the processor pipeline to produce tokenized inputs.
        return self(
            text=prompt,
            images=images,
            videos=videos,
            generation_indices=generation_indices,
            **processor_kwargs,
        )

    def __call__(self, text=None, images=None, videos=None, generation_indices=None, **kwargs) -> BatchFeature:
        """Tokenize ``text``, process ``images`` and assemble the model inputs.

        Only the first sequence of the tokenizer output is used.

        Args:
            text: Prompt(s) to tokenize; indexed per sequence below.
            images: Images to load and tile; ``None`` means no images.
            videos: Accepted but not used.
            generation_indices: Per-sequence ``(start_char, end_char)`` spans
                of assistant text, used to build the prompt mask for LLaDA
                tokenizers. ``None`` marks every token as prompt.
            **kwargs: ``truncation`` (default False), ``max_length``
                (default 1024) and ``padding`` (default False).

        Returns:
            A ``BatchFeature`` with ``input_ids``, ``attention_mask``,
            ``prompt_mask``, ``pixel_values`` and, when
            ``vision_token_share_pe`` is set, ``position_ids``.
        """
        images = [] if images is None else images
        inputs = self.tokenizer(text, padding=False, truncation=False, return_attention_mask=False)
        assistant_masks = []
        input_ids = inputs["input_ids"]
        use_generation_spans = (
            generation_indices is not None
            and 'llada' in self.tokenizer.name_or_path.lower()
        )
        for i, sequence in enumerate(input_ids):
            current_mask = [0] * len(sequence)
            if use_generation_spans:
                for assistant_start_char, assistant_end_char in generation_indices[i]:
                    start_token = inputs.char_to_token(i, assistant_start_char)
                    end_token = inputs.char_to_token(i, assistant_end_char - 1)
                    if start_token is None:
                        # start_token is out of bounds maybe due to truncation.
                        break
                    # Compare against None: token index 0 is a valid end.
                    stop = end_token + 1 if end_token is not None else len(sequence)
                    for token_id in range(start_token, stop):
                        current_mask[token_id] = 1

            assistant_masks.append(current_mask)

        inputs["assistant_masks"] = assistant_masks[0]
        inputs['input_ids'] = input_ids[0]

        truncation = kwargs.pop('truncation', False)
        max_length = kwargs.pop('max_length', 1024)
        padding = kwargs.pop('padding', False)

        inputs = self.process_images(images, inputs=inputs)
        if isinstance(inputs, UserDict):
            inputs = inputs.data

        if 'attention_mask' not in inputs:
            inputs['attention_mask'] = [1] * len(inputs['input_ids'])
        if 'assistant_masks' in inputs:
            # prompt_mask is the complement of the assistant mask.
            inputs['prompt_mask'] = [1 - x for x in inputs.pop('assistant_masks')]

        inputs = self.process_inputs(inputs)
        if truncation and len(inputs['input_ids']) > max_length:
            inputs = self.truncate(inputs, max_length)
        if padding and len(inputs['input_ids']) < max_length:
            inputs = self.padding(inputs, max_length)

        inputs = self.to_tensor(inputs)
        self.check(inputs)
        if self.vision_token_share_pe:
            position_ids = self.get_position_ids(inputs)
            position_ids = torch.tensor([position_ids], dtype=torch.long)
            inputs['position_ids'] = position_ids

        # Internal bookkeeping only; not a model input.
        inputs.pop('sub_image_nums', None)

        return BatchFeature(inputs)

    def get_position_ids(self, inputs: Dict[str, Any]):
        """Build position ids in which each image view occupies one position.

        Text tokens advance the position by one each. For an image, every
        block of ``self.image_token_len`` tokens shares a single position id.

        Returns:
            A list of ints, one per token of the first sequence.
        """
        input_ids = inputs['input_ids'][0]
        # Convert once so the loop compares plain ints instead of tensors.
        token_ids = input_ids.tolist() if isinstance(input_ids, torch.Tensor) else input_ids
        image_token_lens = self.get_image_token_length(inputs)
        position_ids = []
        i, j = 0, 0
        while len(position_ids) < len(token_ids):
            if token_ids[len(position_ids)] == self.image_token_id:
                image_token_len = image_token_lens[j]
                assert image_token_len % self.image_token_len == 0
                num_views = image_token_len // self.image_token_len
                for _ in range(num_views):
                    # All tokens of the same image view share one position id.
                    position_ids += [i] * self.image_token_len
                    i += 1
                j += 1
            else:
                position_ids.append(i)
                i += 1

        assert j == len(image_token_lens) and len(position_ids) == len(token_ids), \
            f"Wrong position_ids, {j} != {len(image_token_lens)} or {len(position_ids)} != {len(token_ids)}"

        return position_ids

    def process_images(self, images, inputs):
        """Load and tile ``images`` and store the pixel tensor in ``inputs``.

        With more than one image each image becomes a single tile; a lone
        image is split into between ``min_sub_img`` and ``max_sub_img`` tiles.
        With no images a single all-zero tile is stored instead.

        Returns:
            ``inputs`` with ``pixel_values`` and ``sub_image_nums`` set.
        """
        images = [load_image(img) for img in images]
        if len(images) > 0:
            processed_images = []
            sub_image_nums = []
            for image in images:
                if len(images) > 1:
                    # for multi images, remove the split strategy
                    sub_images = dynamic_preprocess(
                        image, min_num=1,
                        max_num=1,
                        image_size=self.image_size[0], use_thumbnail=True)
                else:
                    sub_images = dynamic_preprocess(
                        image, min_num=self.min_sub_img,
                        max_num=self.max_sub_img,
                        image_size=self.image_size[0], use_thumbnail=True)

                sub_image_nums.append(len(sub_images))
                processed_images += sub_images
            pixel_values = self.image_processor.preprocess(
                images=processed_images, return_tensors="pt"
            )["pixel_values"]  # (N, c, h, w)
        else:
            pixel_values = torch.zeros((
                1, 3, self.image_size[0], self.image_size[1]), dtype=torch.float32
            )
            sub_image_nums = []

        inputs['pixel_values'] = pixel_values
        inputs['sub_image_nums'] = sub_image_nums
        return inputs

    def truncate(self, inputs: Dict[str, Any], max_length: int):
        """Cut the token sequences to ``max_length``; image tokens must survive."""
        assert self.image_token_id not in inputs['input_ids'][max_length:], f"Truncate image token is not allowed."
        inputs['input_ids'] = inputs['input_ids'][:max_length]
        inputs['attention_mask'] = inputs['attention_mask'][:max_length]
        if 'prompt_mask' in inputs:
            inputs['prompt_mask'] = inputs['prompt_mask'][:max_length]
        return inputs

    def get_image_token_length(self, inputs: Dict[str, Any]) -> List[int]:
        """Return the number of placeholder tokens each image expands to."""
        sub_image_nums = inputs.get('sub_image_nums', None)
        if sub_image_nums is None or len(sub_image_nums) == 0:
            return []
        image_token_lens = [_num * self.num_image_token for _num in sub_image_nums]
        return image_token_lens

    def process_inputs(self, inputs: Dict[str, Any]):
        """Expand every image placeholder in ids and masks to its full length."""
        graft_token_lens = self._get_graft_token_length(inputs)
        inputs['input_ids'] = self._graft_token(inputs['input_ids'], graft_token_lens, self.image_token_id)
        inputs['attention_mask'] = self._graft_token(inputs['attention_mask'], graft_token_lens, 'replicate')
        if 'prompt_mask' in inputs:
            inputs['prompt_mask'] = self._graft_token(inputs['prompt_mask'], graft_token_lens, 'replicate')
        return inputs

    def _graft_token(self, seq, graft_token_lens, value):
        """Replace each listed position of ``seq`` in place by a run of values.

        Args:
            seq: List to expand; it is modified in place and returned.
            graft_token_lens: Ordered mapping of position to run length.
            value: Fill value, or ``'replicate'`` to repeat the element
                already at that position.
        """
        # Walk from the back so earlier positions stay valid while expanding.
        if value == 'replicate':
            for i in reversed(graft_token_lens.keys()):
                seq[i:] = [seq[i]] * graft_token_lens[i] + seq[i+1:]
        else:
            for i in reversed(graft_token_lens.keys()):
                seq[i:] = [value] * graft_token_lens[i] + seq[i+1:]
        return seq

    def _get_graft_token_length(self, inputs: Dict[str, Any]) -> Dict[int, int]:
        """Map each image placeholder position to its expanded token count."""
        image_token_pos = [i for i, x in enumerate(inputs['input_ids']) if x == self.image_token_id]
        image_token_lens = self.get_image_token_length(inputs)
        assert len(image_token_pos) == len(image_token_lens), \
            "Wrong image token count, " \
            f"image_token_count({len(image_token_pos)}) != image_count({len(image_token_lens)})"

        graft_token_lens = OrderedDict(item for item in zip(image_token_pos, image_token_lens))
        return graft_token_lens

    def check(self, inputs: Dict[str, Any]):
        """Assert that the image token count matches the expected feature count."""
        image_embed_token_count = torch.count_nonzero(inputs['input_ids'] == self.image_token_id).item()
        image_embed_count = sum(self.get_image_token_length(inputs))
        assert image_embed_token_count == image_embed_count, \
            "Wrong image embed token count, " \
            f"image_embed_token_count({image_embed_token_count}) != image_embed_count({image_embed_count})"

    def padding(self, inputs: Dict[str, Any], max_length: int):
        """Right-pad ids with the pad token and masks with 0 up to ``max_length``."""
        padding_len = max_length - len(inputs['input_ids'])
        inputs['input_ids'] += [self.pad_token_id] * padding_len
        inputs['attention_mask'] += [0] * padding_len
        if 'prompt_mask' in inputs:
            inputs['prompt_mask'] += [0] * padding_len
        return inputs

    def decode(self, token_ids: Union[List[int], torch.Tensor], **kwargs):
        """Decode one token sequence with the wrapped tokenizer."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        text = self.tokenizer.decode(token_ids, **kwargs)
        return text

    def batch_decode(self, sequences: Union[List[List[int]], torch.Tensor], **kwargs):
        """Decode a batch of token sequences with the wrapped tokenizer."""
        if isinstance(sequences, torch.Tensor):
            sequences = sequences.tolist()
        texts = self.tokenizer.batch_decode(sequences, **kwargs)
        return texts

    def to_tensor(self, inputs):
        """Wrap ids and masks into tensors with a leading batch dimension of 1."""
        inputs['input_ids'] = torch.tensor([inputs['input_ids']], dtype=torch.long)
        inputs['attention_mask'] = torch.tensor([inputs['attention_mask']], dtype=torch.bool)
        if 'prompt_mask' in inputs:
            inputs['prompt_mask'] = torch.tensor([inputs['prompt_mask']], dtype=torch.bool)
        return inputs

    @property
    def pad_token_id(self):
        """Token id used by ``padding``."""
        return self._pad_token_id

    def __repr__(self):
        # Must return a string; returning None made repr() raise TypeError.
        return str(self)

    def __str__(self):
        return 'PDMLLMProcessor'
328
+
329
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose width/height ratio is closest to ``aspect_ratio``.

    Candidates are visited in the given order. A strictly closer candidate
    always wins. A candidate that ties the current best distance replaces it
    only when the image area exceeds half the area of that candidate's grid
    (``image_size`` squared times the number of tiles).

    Returns:
        The chosen entry of ``target_ratios``, or ``(1, 1)`` if none is chosen.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    image_area = width * height
    for candidate in target_ratios:
        grid_w, grid_h = candidate[0], candidate[1]
        gap = abs(aspect_ratio - grid_w / grid_h)
        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
            continue
        is_tie = gap == smallest_gap
        if is_tie and image_area > 0.5 * image_size * image_size * grid_w * grid_h:
            chosen = candidate
    return chosen
344
+
345
+
346
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=512, use_thumbnail=True):
    """Resize ``image`` onto a tile grid and cut it into square tiles.

    Every grid of ``cols x rows`` tiles with ``min_num <= cols * rows <= max_num``
    is a candidate; ``find_closest_aspect_ratio`` selects one. The image is
    resized to that grid and cropped row by row into ``image_size`` squares.
    When ``use_thumbnail`` is set and more than one tile was produced, a
    resized copy of the whole image is appended as the final entry.

    Returns:
        The list of tile images, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate all allowed grids, smallest tile count first.
    grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num
    )
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(
        src_ratio, grids, src_w, src_h, image_size)

    canvas_w = image_size * best_grid[0]
    canvas_h = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for idx in range(tile_count):
        row = idx // tiles_per_row
        col = idx % tiles_per_row
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(canvas.crop(crop_box))
    assert len(tiles) == tile_count
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
processor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_pdmllm.PDMLLMProcessor"
4
+ },
5
+ "image_end_token": "</img>",
6
+ "image_size": [
7
+ 512,
8
+ 512
9
+ ],
10
+ "image_start_token": "<img>",
11
+ "image_token": "<IMG_CONTEXT>",
12
+ "max_sub_img": 6,
13
+ "min_sub_img": 1,
14
+ "processor_class": "PDMLLMProcessor"
15
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|mdm_mask|>",
4
+ "<role>",
5
+ "</role>",
6
+ "<|arithmetic_start|>",
7
+ "<|arithmetic_end|>",
8
+ "<|number_start|>",
9
+ "<|number_end|>",
10
+ {
11
+ "content": "<IMG_CONTEXT>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<img>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</img>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|Mask_Cap_0|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<|Mask_Cap_1|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<|Mask_Cap_2|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<|Mask_Cap_3|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<|Mask_Cap_4|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<|Mask_Cap_5|>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<|Mask_Cap_6|>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<|Mask_Cap_7|>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<|Mask_Cap_8|>",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<|Mask_Cap_9|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<|Mask_Cap_10|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<|Mask_Cap_11|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<|Mask_Cap_12|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<|Mask_Cap_13|>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<|Mask_Cap_14|>",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<|Mask_Cap_15|>",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ }
143
+ ],
144
+ "bos_token": {
145
+ "content": "<|startoftext|>",
146
+ "lstrip": false,
147
+ "normalized": false,
148
+ "rstrip": false,
149
+ "single_word": false
150
+ },
151
+ "cls_token": {
152
+ "content": "[CLS]",
153
+ "lstrip": false,
154
+ "normalized": false,
155
+ "rstrip": false,
156
+ "single_word": false
157
+ },
158
+ "eos_token": {
159
+ "content": "<|endoftext|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false
164
+ },
165
+ "pad_token": {
166
+ "content": "<|endoftext|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false
171
+ }
172
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "126080": {
6
+ "content": "<|startoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "126081": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "126082": {
22
+ "content": "[CLS]",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "126083": {
30
+ "content": "[gMASK]",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "126084": {
38
+ "content": "<|reserved_token_0|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "126085": {
46
+ "content": "<|reserved_token_1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "126086": {
54
+ "content": "<|reserved_token_2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "126087": {
62
+ "content": "<|reserved_token_3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "126088": {
70
+ "content": "<|reserved_token_4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "126089": {
78
+ "content": "<|reserved_token_5|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "126090": {
86
+ "content": "<|reserved_token_6|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "126091": {
94
+ "content": "<|reserved_token_7|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "126092": {
102
+ "content": "<|reserved_token_8|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "126093": {
110
+ "content": "<|reserved_token_9|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "126094": {
118
+ "content": "<|reserved_token_10|>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "126095": {
126
+ "content": "<|reserved_token_11|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "126096": {
134
+ "content": "<|reserved_token_12|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "126097": {
142
+ "content": "<|reserved_token_13|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "126098": {
150
+ "content": "<|reserved_token_14|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "126099": {
158
+ "content": "<|reserved_token_15|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "126100": {
166
+ "content": "<|reserved_token_16|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "126101": {
174
+ "content": "<|reserved_token_17|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "126102": {
182
+ "content": "<|reserved_token_18|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "126103": {
190
+ "content": "<|reserved_token_19|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "126104": {
198
+ "content": "<|reserved_token_20|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "126105": {
206
+ "content": "<|reserved_token_21|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "126106": {
214
+ "content": "<|reserved_token_22|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "126107": {
222
+ "content": "<|reserved_token_23|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "126108": {
230
+ "content": "<|reserved_token_24|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "126109": {
238
+ "content": "<|reserved_token_25|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "126110": {
246
+ "content": "<|reserved_token_26|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "126111": {
254
+ "content": "<|reserved_token_27|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "126112": {
262
+ "content": "<|reserved_token_28|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "126113": {
270
+ "content": "<|reserved_token_29|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "126114": {
278
+ "content": "<|reserved_token_30|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "126115": {
286
+ "content": "<|reserved_token_31|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "126116": {
294
+ "content": "<|reserved_token_32|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "126117": {
302
+ "content": "<|reserved_token_33|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "126118": {
310
+ "content": "<|reserved_token_34|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "126119": {
318
+ "content": "<|reserved_token_35|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "126120": {
326
+ "content": "<|reserved_token_36|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "126121": {
334
+ "content": "<|reserved_token_37|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "126122": {
342
+ "content": "<|reserved_token_38|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "126123": {
350
+ "content": "<|reserved_token_39|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "126124": {
358
+ "content": "<|reserved_token_40|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "126125": {
366
+ "content": "<|reserved_token_41|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "126126": {
374
+ "content": "<|reserved_token_42|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "126127": {
382
+ "content": "<|reserved_token_43|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "126128": {
390
+ "content": "<|reserved_token_44|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "126129": {
398
+ "content": "<|reserved_token_45|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "126130": {
406
+ "content": "<|reserved_token_46|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "126131": {
414
+ "content": "<|reserved_token_47|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "126132": {
422
+ "content": "<|reserved_token_48|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "126133": {
430
+ "content": "<|reserved_token_49|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "126134": {
438
+ "content": "<|reserved_token_50|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "126135": {
446
+ "content": "<|reserved_token_51|>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "126136": {
454
+ "content": "<|reserved_token_52|>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "126137": {
462
+ "content": "<|reserved_token_53|>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "126138": {
470
+ "content": "<|reserved_token_54|>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "126139": {
478
+ "content": "<|reserved_token_55|>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "126140": {
486
+ "content": "<|reserved_token_56|>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "126141": {
494
+ "content": "<|reserved_token_57|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "126142": {
502
+ "content": "<|reserved_token_58|>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "126143": {
510
+ "content": "<|reserved_token_59|>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "126144": {
518
+ "content": "<|reserved_token_60|>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "126145": {
526
+ "content": "<|reserved_token_61|>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "126146": {
534
+ "content": "<|reserved_token_62|>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "126147": {
542
+ "content": "<|reserved_token_63|>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "126148": {
550
+ "content": "<|reserved_token_64|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "126149": {
558
+ "content": "<|reserved_token_65|>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "126150": {
566
+ "content": "<|reserved_token_66|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "126151": {
574
+ "content": "<|reserved_token_67|>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "126152": {
582
+ "content": "<|reserved_token_68|>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "126153": {
590
+ "content": "<|reserved_token_69|>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "126154": {
598
+ "content": "<|reserved_token_70|>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "126155": {
606
+ "content": "<|reserved_token_71|>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "126156": {
614
+ "content": "<|reserved_token_72|>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "126157": {
622
+ "content": "<|reserved_token_73|>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "126158": {
630
+ "content": "<|reserved_token_74|>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "126159": {
638
+ "content": "<|reserved_token_75|>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "126160": {
646
+ "content": "<|reserved_token_76|>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "126161": {
654
+ "content": "<|reserved_token_77|>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "126162": {
662
+ "content": "<|reserved_token_78|>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "126163": {
670
+ "content": "<|reserved_token_79|>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "126164": {
678
+ "content": "<|reserved_token_80|>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "126165": {
686
+ "content": "<|reserved_token_81|>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "126166": {
694
+ "content": "<|reserved_token_82|>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "126167": {
702
+ "content": "<|reserved_token_83|>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "126168": {
710
+ "content": "<|reserved_token_84|>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "126169": {
718
+ "content": "<|reserved_token_85|>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "126170": {
726
+ "content": "<|reserved_token_86|>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "126171": {
734
+ "content": "<|reserved_token_87|>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "126172": {
742
+ "content": "<|reserved_token_88|>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "126173": {
750
+ "content": "<|reserved_token_89|>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "126174": {
758
+ "content": "<|reserved_token_90|>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "126175": {
766
+ "content": "<|reserved_token_91|>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "126176": {
774
+ "content": "<|reserved_token_92|>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "126177": {
782
+ "content": "<|reserved_token_93|>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "126178": {
790
+ "content": "<|reserved_token_94|>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "126179": {
798
+ "content": "<|reserved_token_95|>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "126180": {
806
+ "content": "<|reserved_token_96|>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "126181": {
814
+ "content": "<|reserved_token_97|>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "126182": {
822
+ "content": "<|reserved_token_98|>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "126183": {
830
+ "content": "<|reserved_token_99|>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "126184": {
838
+ "content": "<|reserved_token_100|>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "126185": {
846
+ "content": "<|reserved_token_101|>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "126186": {
854
+ "content": "<|reserved_token_102|>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "126187": {
862
+ "content": "<|reserved_token_103|>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "126188": {
870
+ "content": "<|reserved_token_104|>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "126189": {
878
+ "content": "<|reserved_token_105|>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "126190": {
886
+ "content": "<|reserved_token_106|>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "126191": {
894
+ "content": "<|reserved_token_107|>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ },
901
+ "126192": {
902
+ "content": "<|reserved_token_108|>",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": true
908
+ },
909
+ "126193": {
910
+ "content": "<|reserved_token_109|>",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": true
916
+ },
917
+ "126194": {
918
+ "content": "<|reserved_token_110|>",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": true
924
+ },
925
+ "126195": {
926
+ "content": "<|reserved_token_111|>",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": true
932
+ },
933
+ "126196": {
934
+ "content": "<|reserved_token_112|>",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": true
940
+ },
941
+ "126197": {
942
+ "content": "<|reserved_token_113|>",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": true
948
+ },
949
+ "126198": {
950
+ "content": "<|reserved_token_114|>",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": true
956
+ },
957
+ "126199": {
958
+ "content": "<|reserved_token_115|>",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": true
964
+ },
965
+ "126200": {
966
+ "content": "<|reserved_token_116|>",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": true
972
+ },
973
+ "126201": {
974
+ "content": "<|reserved_token_117|>",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": true
980
+ },
981
+ "126202": {
982
+ "content": "<|reserved_token_118|>",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": true
988
+ },
989
+ "126203": {
990
+ "content": "<|reserved_token_119|>",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": true
996
+ },
997
+ "126204": {
998
+ "content": "<|reserved_token_120|>",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": true
1004
+ },
1005
+ "126205": {
1006
+ "content": "<|reserved_token_121|>",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": true
1012
+ },
1013
+ "126206": {
1014
+ "content": "<|reserved_token_122|>",
1015
+ "lstrip": false,
1016
+ "normalized": false,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": true
1020
+ },
1021
+ "126207": {
1022
+ "content": "<|reserved_token_123|>",
1023
+ "lstrip": false,
1024
+ "normalized": false,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": true
1028
+ },
1029
+ "126208": {
1030
+ "content": "<|reserved_token_124|>",
1031
+ "lstrip": false,
1032
+ "normalized": false,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": true
1036
+ },
1037
+ "126209": {
1038
+ "content": "<|reserved_token_125|>",
1039
+ "lstrip": false,
1040
+ "normalized": false,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": true
1044
+ },
1045
+ "126210": {
1046
+ "content": "<|reserved_token_126|>",
1047
+ "lstrip": false,
1048
+ "normalized": false,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": true
1052
+ },
1053
+ "126211": {
1054
+ "content": "<|reserved_token_127|>",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": true
1060
+ },
1061
+ "126212": {
1062
+ "content": "<|reserved_token_128|>",
1063
+ "lstrip": false,
1064
+ "normalized": false,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": true
1068
+ },
1069
+ "126213": {
1070
+ "content": "<|reserved_token_129|>",
1071
+ "lstrip": false,
1072
+ "normalized": false,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": true
1076
+ },
1077
+ "126214": {
1078
+ "content": "<|reserved_token_130|>",
1079
+ "lstrip": false,
1080
+ "normalized": false,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": true
1084
+ },
1085
+ "126215": {
1086
+ "content": "<|reserved_token_131|>",
1087
+ "lstrip": false,
1088
+ "normalized": false,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": true
1092
+ },
1093
+ "126216": {
1094
+ "content": "<|reserved_token_132|>",
1095
+ "lstrip": false,
1096
+ "normalized": false,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": true
1100
+ },
1101
+ "126217": {
1102
+ "content": "<|reserved_token_133|>",
1103
+ "lstrip": false,
1104
+ "normalized": false,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": true
1108
+ },
1109
+ "126218": {
1110
+ "content": "<|reserved_token_134|>",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": true
1116
+ },
1117
+ "126219": {
1118
+ "content": "<|reserved_token_135|>",
1119
+ "lstrip": false,
1120
+ "normalized": false,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": true
1124
+ },
1125
+ "126220": {
1126
+ "content": "<|reserved_token_136|>",
1127
+ "lstrip": false,
1128
+ "normalized": false,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": true
1132
+ },
1133
+ "126221": {
1134
+ "content": "<|reserved_token_137|>",
1135
+ "lstrip": false,
1136
+ "normalized": false,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": true
1140
+ },
1141
+ "126222": {
1142
+ "content": "<|reserved_token_138|>",
1143
+ "lstrip": false,
1144
+ "normalized": false,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": true
1148
+ },
1149
+ "126223": {
1150
+ "content": "<|reserved_token_139|>",
1151
+ "lstrip": false,
1152
+ "normalized": false,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": true
1156
+ },
1157
+ "126224": {
1158
+ "content": "<|reserved_token_140|>",
1159
+ "lstrip": false,
1160
+ "normalized": false,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": true
1164
+ },
1165
+ "126225": {
1166
+ "content": "<|reserved_token_141|>",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": true
1172
+ },
1173
+ "126226": {
1174
+ "content": "<|reserved_token_142|>",
1175
+ "lstrip": false,
1176
+ "normalized": false,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": true
1180
+ },
1181
+ "126227": {
1182
+ "content": "<|reserved_token_143|>",
1183
+ "lstrip": false,
1184
+ "normalized": false,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": true
1188
+ },
1189
+ "126228": {
1190
+ "content": "<|reserved_token_144|>",
1191
+ "lstrip": false,
1192
+ "normalized": false,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": true
1196
+ },
1197
+ "126229": {
1198
+ "content": "<|reserved_token_145|>",
1199
+ "lstrip": false,
1200
+ "normalized": false,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": true
1204
+ },
1205
+ "126230": {
1206
+ "content": "<|reserved_token_146|>",
1207
+ "lstrip": false,
1208
+ "normalized": false,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": true
1212
+ },
1213
+ "126231": {
1214
+ "content": "<|reserved_token_147|>",
1215
+ "lstrip": false,
1216
+ "normalized": false,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": true
1220
+ },
1221
+ "126232": {
1222
+ "content": "<|reserved_token_148|>",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": true
1228
+ },
1229
+ "126233": {
1230
+ "content": "<|reserved_token_149|>",
1231
+ "lstrip": false,
1232
+ "normalized": false,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": true
1236
+ },
1237
+ "126234": {
1238
+ "content": "<|reserved_token_150|>",
1239
+ "lstrip": false,
1240
+ "normalized": false,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": true
1244
+ },
1245
+ "126235": {
1246
+ "content": "<|reserved_token_151|>",
1247
+ "lstrip": false,
1248
+ "normalized": false,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": true
1252
+ },
1253
+ "126236": {
1254
+ "content": "<|reserved_token_152|>",
1255
+ "lstrip": false,
1256
+ "normalized": false,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": true
1260
+ },
1261
+ "126237": {
1262
+ "content": "<|reserved_token_153|>",
1263
+ "lstrip": false,
1264
+ "normalized": false,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": true
1268
+ },
1269
+ "126238": {
1270
+ "content": "<|reserved_token_154|>",
1271
+ "lstrip": false,
1272
+ "normalized": false,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": true
1276
+ },
1277
+ "126239": {
1278
+ "content": "<|reserved_token_155|>",
1279
+ "lstrip": false,
1280
+ "normalized": false,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": true
1284
+ },
1285
+ "126240": {
1286
+ "content": "<|reserved_token_156|>",
1287
+ "lstrip": false,
1288
+ "normalized": false,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": true
1292
+ },
1293
+ "126241": {
1294
+ "content": "<|reserved_token_157|>",
1295
+ "lstrip": false,
1296
+ "normalized": false,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": true
1300
+ },
1301
+ "126242": {
1302
+ "content": "<|reserved_token_158|>",
1303
+ "lstrip": false,
1304
+ "normalized": false,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": true
1308
+ },
1309
+ "126243": {
1310
+ "content": "<|reserved_token_159|>",
1311
+ "lstrip": false,
1312
+ "normalized": false,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": true
1316
+ },
1317
+ "126244": {
1318
+ "content": "<|reserved_token_160|>",
1319
+ "lstrip": false,
1320
+ "normalized": false,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": true
1324
+ },
1325
+ "126245": {
1326
+ "content": "<|reserved_token_161|>",
1327
+ "lstrip": false,
1328
+ "normalized": false,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": true
1332
+ },
1333
+ "126246": {
1334
+ "content": "<|reserved_token_162|>",
1335
+ "lstrip": false,
1336
+ "normalized": false,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": true
1340
+ },
1341
+ "126247": {
1342
+ "content": "<|reserved_token_163|>",
1343
+ "lstrip": false,
1344
+ "normalized": false,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": true
1348
+ },
1349
+ "126248": {
1350
+ "content": "<|reserved_token_164|>",
1351
+ "lstrip": false,
1352
+ "normalized": false,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": true
1356
+ },
1357
+ "126249": {
1358
+ "content": "<|reserved_token_165|>",
1359
+ "lstrip": false,
1360
+ "normalized": false,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": true
1364
+ },
1365
+ "126250": {
1366
+ "content": "<|reserved_token_166|>",
1367
+ "lstrip": false,
1368
+ "normalized": false,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": true
1372
+ },
1373
+ "126251": {
1374
+ "content": "<|reserved_token_167|>",
1375
+ "lstrip": false,
1376
+ "normalized": false,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": true
1380
+ },
1381
+ "126252": {
1382
+ "content": "<|reserved_token_168|>",
1383
+ "lstrip": false,
1384
+ "normalized": false,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": true
1388
+ },
1389
+ "126253": {
1390
+ "content": "<|reserved_token_169|>",
1391
+ "lstrip": false,
1392
+ "normalized": false,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": true
1396
+ },
1397
+ "126254": {
1398
+ "content": "<|reserved_token_170|>",
1399
+ "lstrip": false,
1400
+ "normalized": false,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": true
1404
+ },
1405
+ "126255": {
1406
+ "content": "<|reserved_token_171|>",
1407
+ "lstrip": false,
1408
+ "normalized": false,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": true
1412
+ },
1413
+ "126256": {
1414
+ "content": "<|reserved_token_172|>",
1415
+ "lstrip": false,
1416
+ "normalized": false,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": true
1420
+ },
1421
+ "126257": {
1422
+ "content": "<|reserved_token_173|>",
1423
+ "lstrip": false,
1424
+ "normalized": false,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": true
1428
+ },
1429
+ "126258": {
1430
+ "content": "<|reserved_token_174|>",
1431
+ "lstrip": false,
1432
+ "normalized": false,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": true
1436
+ },
1437
+ "126259": {
1438
+ "content": "<|reserved_token_175|>",
1439
+ "lstrip": false,
1440
+ "normalized": false,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": true
1444
+ },
1445
+ "126260": {
1446
+ "content": "<|reserved_token_176|>",
1447
+ "lstrip": false,
1448
+ "normalized": false,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": true
1452
+ },
1453
+ "126261": {
1454
+ "content": "<|reserved_token_177|>",
1455
+ "lstrip": false,
1456
+ "normalized": false,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": true
1460
+ },
1461
+ "126262": {
1462
+ "content": "<|reserved_token_178|>",
1463
+ "lstrip": false,
1464
+ "normalized": false,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": true
1468
+ },
1469
+ "126263": {
1470
+ "content": "<|reserved_token_179|>",
1471
+ "lstrip": false,
1472
+ "normalized": false,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": true
1476
+ },
1477
+ "126264": {
1478
+ "content": "<|reserved_token_180|>",
1479
+ "lstrip": false,
1480
+ "normalized": false,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": true
1484
+ },
1485
+ "126265": {
1486
+ "content": "<|reserved_token_181|>",
1487
+ "lstrip": false,
1488
+ "normalized": false,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": true
1492
+ },
1493
+ "126266": {
1494
+ "content": "<|reserved_token_182|>",
1495
+ "lstrip": false,
1496
+ "normalized": false,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": true
1500
+ },
1501
+ "126267": {
1502
+ "content": "<|reserved_token_183|>",
1503
+ "lstrip": false,
1504
+ "normalized": false,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": true
1508
+ },
1509
+ "126268": {
1510
+ "content": "<|reserved_token_184|>",
1511
+ "lstrip": false,
1512
+ "normalized": false,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": true
1516
+ },
1517
+ "126269": {
1518
+ "content": "<|reserved_token_185|>",
1519
+ "lstrip": false,
1520
+ "normalized": false,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": true
1524
+ },
1525
+ "126270": {
1526
+ "content": "<|reserved_token_186|>",
1527
+ "lstrip": false,
1528
+ "normalized": false,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": true
1532
+ },
1533
+ "126271": {
1534
+ "content": "<|reserved_token_187|>",
1535
+ "lstrip": false,
1536
+ "normalized": false,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": true
1540
+ },
1541
+ "126272": {
1542
+ "content": "<|reserved_token_188|>",
1543
+ "lstrip": false,
1544
+ "normalized": false,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": true
1548
+ },
1549
+ "126273": {
1550
+ "content": "<|reserved_token_189|>",
1551
+ "lstrip": false,
1552
+ "normalized": false,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": true
1556
+ },
1557
+ "126274": {
1558
+ "content": "<|reserved_token_190|>",
1559
+ "lstrip": false,
1560
+ "normalized": false,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": true
1564
+ },
1565
+ "126275": {
1566
+ "content": "<|reserved_token_191|>",
1567
+ "lstrip": false,
1568
+ "normalized": false,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": true
1572
+ },
1573
+ "126276": {
1574
+ "content": "<|reserved_token_192|>",
1575
+ "lstrip": false,
1576
+ "normalized": false,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": true
1580
+ },
1581
+ "126277": {
1582
+ "content": "<|reserved_token_193|>",
1583
+ "lstrip": false,
1584
+ "normalized": false,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": true
1588
+ },
1589
+ "126278": {
1590
+ "content": "<|reserved_token_194|>",
1591
+ "lstrip": false,
1592
+ "normalized": false,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": true
1596
+ },
1597
+ "126279": {
1598
+ "content": "<|reserved_token_195|>",
1599
+ "lstrip": false,
1600
+ "normalized": false,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": true
1604
+ },
1605
+ "126280": {
1606
+ "content": "<|reserved_token_196|>",
1607
+ "lstrip": false,
1608
+ "normalized": false,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": true
1612
+ },
1613
+ "126281": {
1614
+ "content": "<|reserved_token_197|>",
1615
+ "lstrip": false,
1616
+ "normalized": false,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": true
1620
+ },
1621
+ "126282": {
1622
+ "content": "<|reserved_token_198|>",
1623
+ "lstrip": false,
1624
+ "normalized": false,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": true
1628
+ },
1629
+ "126283": {
1630
+ "content": "<|reserved_token_199|>",
1631
+ "lstrip": false,
1632
+ "normalized": false,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": true
1636
+ },
1637
+ "126284": {
1638
+ "content": "<|reserved_token_200|>",
1639
+ "lstrip": false,
1640
+ "normalized": false,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": true
1644
+ },
1645
+ "126285": {
1646
+ "content": "<|reserved_token_201|>",
1647
+ "lstrip": false,
1648
+ "normalized": false,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": true
1652
+ },
1653
+ "126286": {
1654
+ "content": "<|reserved_token_202|>",
1655
+ "lstrip": false,
1656
+ "normalized": false,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": true
1660
+ },
1661
+ "126287": {
1662
+ "content": "<|reserved_token_203|>",
1663
+ "lstrip": false,
1664
+ "normalized": false,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": true
1668
+ },
1669
+ "126288": {
1670
+ "content": "<|reserved_token_204|>",
1671
+ "lstrip": false,
1672
+ "normalized": false,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": true
1676
+ },
1677
+ "126289": {
1678
+ "content": "<|reserved_token_205|>",
1679
+ "lstrip": false,
1680
+ "normalized": false,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": true
1684
+ },
1685
+ "126290": {
1686
+ "content": "<|reserved_token_206|>",
1687
+ "lstrip": false,
1688
+ "normalized": false,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": true
1692
+ },
1693
+ "126291": {
1694
+ "content": "<|reserved_token_207|>",
1695
+ "lstrip": false,
1696
+ "normalized": false,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": true
1700
+ },
1701
+ "126292": {
1702
+ "content": "<|reserved_token_208|>",
1703
+ "lstrip": false,
1704
+ "normalized": false,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": true
1708
+ },
1709
+ "126293": {
1710
+ "content": "<|reserved_token_209|>",
1711
+ "lstrip": false,
1712
+ "normalized": false,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": true
1716
+ },
1717
+ "126294": {
1718
+ "content": "<|reserved_token_210|>",
1719
+ "lstrip": false,
1720
+ "normalized": false,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": true
1724
+ },
1725
+ "126295": {
1726
+ "content": "<|reserved_token_211|>",
1727
+ "lstrip": false,
1728
+ "normalized": false,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": true
1732
+ },
1733
+ "126296": {
1734
+ "content": "<|reserved_token_212|>",
1735
+ "lstrip": false,
1736
+ "normalized": false,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": true
1740
+ },
1741
+ "126297": {
1742
+ "content": "<|reserved_token_213|>",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": true
1748
+ },
1749
+ "126298": {
1750
+ "content": "<|reserved_token_214|>",
1751
+ "lstrip": false,
1752
+ "normalized": false,
1753
+ "rstrip": false,
1754
+ "single_word": false,
1755
+ "special": true
1756
+ },
1757
+ "126299": {
1758
+ "content": "<|reserved_token_215|>",
1759
+ "lstrip": false,
1760
+ "normalized": false,
1761
+ "rstrip": false,
1762
+ "single_word": false,
1763
+ "special": true
1764
+ },
1765
+ "126300": {
1766
+ "content": "<|reserved_token_216|>",
1767
+ "lstrip": false,
1768
+ "normalized": false,
1769
+ "rstrip": false,
1770
+ "single_word": false,
1771
+ "special": true
1772
+ },
1773
+ "126301": {
1774
+ "content": "<|reserved_token_217|>",
1775
+ "lstrip": false,
1776
+ "normalized": false,
1777
+ "rstrip": false,
1778
+ "single_word": false,
1779
+ "special": true
1780
+ },
1781
+ "126302": {
1782
+ "content": "<|reserved_token_218|>",
1783
+ "lstrip": false,
1784
+ "normalized": false,
1785
+ "rstrip": false,
1786
+ "single_word": false,
1787
+ "special": true
1788
+ },
1789
+ "126303": {
1790
+ "content": "<|reserved_token_219|>",
1791
+ "lstrip": false,
1792
+ "normalized": false,
1793
+ "rstrip": false,
1794
+ "single_word": false,
1795
+ "special": true
1796
+ },
1797
+ "126304": {
1798
+ "content": "<|reserved_token_220|>",
1799
+ "lstrip": false,
1800
+ "normalized": false,
1801
+ "rstrip": false,
1802
+ "single_word": false,
1803
+ "special": true
1804
+ },
1805
+ "126305": {
1806
+ "content": "<|reserved_token_221|>",
1807
+ "lstrip": false,
1808
+ "normalized": false,
1809
+ "rstrip": false,
1810
+ "single_word": false,
1811
+ "special": true
1812
+ },
1813
+ "126306": {
1814
+ "content": "<|reserved_token_222|>",
1815
+ "lstrip": false,
1816
+ "normalized": false,
1817
+ "rstrip": false,
1818
+ "single_word": false,
1819
+ "special": true
1820
+ },
1821
+ "126307": {
1822
+ "content": "<|reserved_token_223|>",
1823
+ "lstrip": false,
1824
+ "normalized": false,
1825
+ "rstrip": false,
1826
+ "single_word": false,
1827
+ "special": true
1828
+ },
1829
+ "126308": {
1830
+ "content": "<|reserved_token_224|>",
1831
+ "lstrip": false,
1832
+ "normalized": false,
1833
+ "rstrip": false,
1834
+ "single_word": false,
1835
+ "special": true
1836
+ },
1837
+ "126309": {
1838
+ "content": "<|reserved_token_225|>",
1839
+ "lstrip": false,
1840
+ "normalized": false,
1841
+ "rstrip": false,
1842
+ "single_word": false,
1843
+ "special": true
1844
+ },
1845
+ "126310": {
1846
+ "content": "<|reserved_token_226|>",
1847
+ "lstrip": false,
1848
+ "normalized": false,
1849
+ "rstrip": false,
1850
+ "single_word": false,
1851
+ "special": true
1852
+ },
1853
+ "126311": {
1854
+ "content": "<|reserved_token_227|>",
1855
+ "lstrip": false,
1856
+ "normalized": false,
1857
+ "rstrip": false,
1858
+ "single_word": false,
1859
+ "special": true
1860
+ },
1861
+ "126312": {
1862
+ "content": "<|reserved_token_228|>",
1863
+ "lstrip": false,
1864
+ "normalized": false,
1865
+ "rstrip": false,
1866
+ "single_word": false,
1867
+ "special": true
1868
+ },
1869
+ "126313": {
1870
+ "content": "<|reserved_token_229|>",
1871
+ "lstrip": false,
1872
+ "normalized": false,
1873
+ "rstrip": false,
1874
+ "single_word": false,
1875
+ "special": true
1876
+ },
1877
+ "126314": {
1878
+ "content": "<|reserved_token_230|>",
1879
+ "lstrip": false,
1880
+ "normalized": false,
1881
+ "rstrip": false,
1882
+ "single_word": false,
1883
+ "special": true
1884
+ },
1885
+ "126315": {
1886
+ "content": "<|reserved_token_231|>",
1887
+ "lstrip": false,
1888
+ "normalized": false,
1889
+ "rstrip": false,
1890
+ "single_word": false,
1891
+ "special": true
1892
+ },
1893
+ "126316": {
1894
+ "content": "<|reserved_token_232|>",
1895
+ "lstrip": false,
1896
+ "normalized": false,
1897
+ "rstrip": false,
1898
+ "single_word": false,
1899
+ "special": true
1900
+ },
1901
+ "126317": {
1902
+ "content": "<|reserved_token_233|>",
1903
+ "lstrip": false,
1904
+ "normalized": false,
1905
+ "rstrip": false,
1906
+ "single_word": false,
1907
+ "special": true
1908
+ },
1909
+ "126318": {
1910
+ "content": "<|reserved_token_234|>",
1911
+ "lstrip": false,
1912
+ "normalized": false,
1913
+ "rstrip": false,
1914
+ "single_word": false,
1915
+ "special": true
1916
+ },
1917
+ "126319": {
1918
+ "content": "<|reserved_token_235|>",
1919
+ "lstrip": false,
1920
+ "normalized": false,
1921
+ "rstrip": false,
1922
+ "single_word": false,
1923
+ "special": true
1924
+ },
1925
+ "126320": {
1926
+ "content": "<|reserved_token_236|>",
1927
+ "lstrip": false,
1928
+ "normalized": false,
1929
+ "rstrip": false,
1930
+ "single_word": false,
1931
+ "special": true
1932
+ },
1933
+ "126321": {
1934
+ "content": "<|reserved_token_237|>",
1935
+ "lstrip": false,
1936
+ "normalized": false,
1937
+ "rstrip": false,
1938
+ "single_word": false,
1939
+ "special": true
1940
+ },
1941
+ "126322": {
1942
+ "content": "<|reserved_token_238|>",
1943
+ "lstrip": false,
1944
+ "normalized": false,
1945
+ "rstrip": false,
1946
+ "single_word": false,
1947
+ "special": true
1948
+ },
1949
+ "126323": {
1950
+ "content": "<|reserved_token_239|>",
1951
+ "lstrip": false,
1952
+ "normalized": false,
1953
+ "rstrip": false,
1954
+ "single_word": false,
1955
+ "special": true
1956
+ },
1957
+ "126324": {
1958
+ "content": "<|reserved_token_240|>",
1959
+ "lstrip": false,
1960
+ "normalized": false,
1961
+ "rstrip": false,
1962
+ "single_word": false,
1963
+ "special": true
1964
+ },
1965
+ "126325": {
1966
+ "content": "<|reserved_token_241|>",
1967
+ "lstrip": false,
1968
+ "normalized": false,
1969
+ "rstrip": false,
1970
+ "single_word": false,
1971
+ "special": true
1972
+ },
1973
+ "126326": {
1974
+ "content": "<|reserved_token_242|>",
1975
+ "lstrip": false,
1976
+ "normalized": false,
1977
+ "rstrip": false,
1978
+ "single_word": false,
1979
+ "special": true
1980
+ },
1981
+ "126327": {
1982
+ "content": "<|reserved_token_243|>",
1983
+ "lstrip": false,
1984
+ "normalized": false,
1985
+ "rstrip": false,
1986
+ "single_word": false,
1987
+ "special": true
1988
+ },
1989
+ "126328": {
1990
+ "content": "<|reserved_token_244|>",
1991
+ "lstrip": false,
1992
+ "normalized": false,
1993
+ "rstrip": false,
1994
+ "single_word": false,
1995
+ "special": true
1996
+ },
1997
+ "126329": {
1998
+ "content": "<|reserved_token_245|>",
1999
+ "lstrip": false,
2000
+ "normalized": false,
2001
+ "rstrip": false,
2002
+ "single_word": false,
2003
+ "special": true
2004
+ },
2005
+ "126330": {
2006
+ "content": "<|reserved_token_246|>",
2007
+ "lstrip": false,
2008
+ "normalized": false,
2009
+ "rstrip": false,
2010
+ "single_word": false,
2011
+ "special": true
2012
+ },
2013
+ "126331": {
2014
+ "content": "<|reserved_token_247|>",
2015
+ "lstrip": false,
2016
+ "normalized": false,
2017
+ "rstrip": false,
2018
+ "single_word": false,
2019
+ "special": true
2020
+ },
2021
+ "126332": {
2022
+ "content": "<|reserved_token_248|>",
2023
+ "lstrip": false,
2024
+ "normalized": false,
2025
+ "rstrip": false,
2026
+ "single_word": false,
2027
+ "special": true
2028
+ },
2029
+ "126333": {
2030
+ "content": "<|reserved_token_249|>",
2031
+ "lstrip": false,
2032
+ "normalized": false,
2033
+ "rstrip": false,
2034
+ "single_word": false,
2035
+ "special": true
2036
+ },
2037
+ "126334": {
2038
+ "content": "<|reserved_token_250|>",
2039
+ "lstrip": false,
2040
+ "normalized": false,
2041
+ "rstrip": false,
2042
+ "single_word": false,
2043
+ "special": true
2044
+ },
2045
+ "126335": {
2046
+ "content": "<|reserved_token_251|>",
2047
+ "lstrip": false,
2048
+ "normalized": false,
2049
+ "rstrip": false,
2050
+ "single_word": false,
2051
+ "special": true
2052
+ },
2053
+ "126336": {
2054
+ "content": "<|mdm_mask|>",
2055
+ "lstrip": false,
2056
+ "normalized": false,
2057
+ "rstrip": false,
2058
+ "single_word": false,
2059
+ "special": true
2060
+ },
2061
+ "126337": {
2062
+ "content": "<|reserved_token_253|>",
2063
+ "lstrip": false,
2064
+ "normalized": false,
2065
+ "rstrip": false,
2066
+ "single_word": false,
2067
+ "special": true
2068
+ },
2069
+ "126338": {
2070
+ "content": "<|reserved_token_254|>",
2071
+ "lstrip": false,
2072
+ "normalized": false,
2073
+ "rstrip": false,
2074
+ "single_word": false,
2075
+ "special": true
2076
+ },
2077
+ "126339": {
2078
+ "content": "<|reserved_token_255|>",
2079
+ "lstrip": false,
2080
+ "normalized": false,
2081
+ "rstrip": false,
2082
+ "single_word": false,
2083
+ "special": true
2084
+ },
2085
+ "126340": {
2086
+ "content": "<role>",
2087
+ "lstrip": false,
2088
+ "normalized": false,
2089
+ "rstrip": false,
2090
+ "single_word": false,
2091
+ "special": true
2092
+ },
2093
+ "126341": {
2094
+ "content": "</role>",
2095
+ "lstrip": false,
2096
+ "normalized": false,
2097
+ "rstrip": false,
2098
+ "single_word": false,
2099
+ "special": true
2100
+ },
2101
+ "126342": {
2102
+ "content": "<|arithmetic_start|>",
2103
+ "lstrip": false,
2104
+ "normalized": false,
2105
+ "rstrip": false,
2106
+ "single_word": false,
2107
+ "special": true
2108
+ },
2109
+ "126343": {
2110
+ "content": "<|arithmetic_end|>",
2111
+ "lstrip": false,
2112
+ "normalized": false,
2113
+ "rstrip": false,
2114
+ "single_word": false,
2115
+ "special": true
2116
+ },
2117
+ "126344": {
2118
+ "content": "<|number_start|>",
2119
+ "lstrip": false,
2120
+ "normalized": false,
2121
+ "rstrip": false,
2122
+ "single_word": false,
2123
+ "special": true
2124
+ },
2125
+ "126345": {
2126
+ "content": "<|number_end|>",
2127
+ "lstrip": false,
2128
+ "normalized": false,
2129
+ "rstrip": false,
2130
+ "single_word": false,
2131
+ "special": true
2132
+ },
2133
+ "126346": {
2134
+ "content": "<|start_header_id|>",
2135
+ "lstrip": false,
2136
+ "normalized": false,
2137
+ "rstrip": false,
2138
+ "single_word": false,
2139
+ "special": true
2140
+ },
2141
+ "126347": {
2142
+ "content": "<|end_header_id|>",
2143
+ "lstrip": false,
2144
+ "normalized": false,
2145
+ "rstrip": false,
2146
+ "single_word": false,
2147
+ "special": true
2148
+ },
2149
+ "126348": {
2150
+ "content": "<|eot_id|>",
2151
+ "lstrip": false,
2152
+ "normalized": false,
2153
+ "rstrip": false,
2154
+ "single_word": false,
2155
+ "special": true
2156
+ },
2157
+ "126349": {
2158
+ "content": "<IMG_CONTEXT>",
2159
+ "lstrip": false,
2160
+ "normalized": false,
2161
+ "rstrip": false,
2162
+ "single_word": false,
2163
+ "special": true
2164
+ },
2165
+ "126350": {
2166
+ "content": "<img>",
2167
+ "lstrip": false,
2168
+ "normalized": false,
2169
+ "rstrip": false,
2170
+ "single_word": false,
2171
+ "special": true
2172
+ },
2173
+ "126351": {
2174
+ "content": "</img>",
2175
+ "lstrip": false,
2176
+ "normalized": false,
2177
+ "rstrip": false,
2178
+ "single_word": false,
2179
+ "special": true
2180
+ },
2181
+ "126352": {
2182
+ "content": "<|Mask_Cap_0|>",
2183
+ "lstrip": false,
2184
+ "normalized": false,
2185
+ "rstrip": false,
2186
+ "single_word": false,
2187
+ "special": true
2188
+ },
2189
+ "126353": {
2190
+ "content": "<|Mask_Cap_1|>",
2191
+ "lstrip": false,
2192
+ "normalized": false,
2193
+ "rstrip": false,
2194
+ "single_word": false,
2195
+ "special": true
2196
+ },
2197
+ "126354": {
2198
+ "content": "<|Mask_Cap_2|>",
2199
+ "lstrip": false,
2200
+ "normalized": false,
2201
+ "rstrip": false,
2202
+ "single_word": false,
2203
+ "special": true
2204
+ },
2205
+ "126355": {
2206
+ "content": "<|Mask_Cap_3|>",
2207
+ "lstrip": false,
2208
+ "normalized": false,
2209
+ "rstrip": false,
2210
+ "single_word": false,
2211
+ "special": true
2212
+ },
2213
+ "126356": {
2214
+ "content": "<|Mask_Cap_4|>",
2215
+ "lstrip": false,
2216
+ "normalized": false,
2217
+ "rstrip": false,
2218
+ "single_word": false,
2219
+ "special": true
2220
+ },
2221
+ "126357": {
2222
+ "content": "<|Mask_Cap_5|>",
2223
+ "lstrip": false,
2224
+ "normalized": false,
2225
+ "rstrip": false,
2226
+ "single_word": false,
2227
+ "special": true
2228
+ },
2229
+ "126358": {
2230
+ "content": "<|Mask_Cap_6|>",
2231
+ "lstrip": false,
2232
+ "normalized": false,
2233
+ "rstrip": false,
2234
+ "single_word": false,
2235
+ "special": true
2236
+ },
2237
+ "126359": {
2238
+ "content": "<|Mask_Cap_7|>",
2239
+ "lstrip": false,
2240
+ "normalized": false,
2241
+ "rstrip": false,
2242
+ "single_word": false,
2243
+ "special": true
2244
+ },
2245
+ "126360": {
2246
+ "content": "<|Mask_Cap_8|>",
2247
+ "lstrip": false,
2248
+ "normalized": false,
2249
+ "rstrip": false,
2250
+ "single_word": false,
2251
+ "special": true
2252
+ },
2253
+ "126361": {
2254
+ "content": "<|Mask_Cap_9|>",
2255
+ "lstrip": false,
2256
+ "normalized": false,
2257
+ "rstrip": false,
2258
+ "single_word": false,
2259
+ "special": true
2260
+ },
2261
+ "126362": {
2262
+ "content": "<|Mask_Cap_10|>",
2263
+ "lstrip": false,
2264
+ "normalized": false,
2265
+ "rstrip": false,
2266
+ "single_word": false,
2267
+ "special": true
2268
+ },
2269
+ "126363": {
2270
+ "content": "<|Mask_Cap_11|>",
2271
+ "lstrip": false,
2272
+ "normalized": false,
2273
+ "rstrip": false,
2274
+ "single_word": false,
2275
+ "special": true
2276
+ },
2277
+ "126364": {
2278
+ "content": "<|Mask_Cap_12|>",
2279
+ "lstrip": false,
2280
+ "normalized": false,
2281
+ "rstrip": false,
2282
+ "single_word": false,
2283
+ "special": true
2284
+ },
2285
+ "126365": {
2286
+ "content": "<|Mask_Cap_13|>",
2287
+ "lstrip": false,
2288
+ "normalized": false,
2289
+ "rstrip": false,
2290
+ "single_word": false,
2291
+ "special": true
2292
+ },
2293
+ "126366": {
2294
+ "content": "<|Mask_Cap_14|>",
2295
+ "lstrip": false,
2296
+ "normalized": false,
2297
+ "rstrip": false,
2298
+ "single_word": false,
2299
+ "special": true
2300
+ },
2301
+ "126367": {
2302
+ "content": "<|Mask_Cap_15|>",
2303
+ "lstrip": false,
2304
+ "normalized": false,
2305
+ "rstrip": false,
2306
+ "single_word": false,
2307
+ "special": true
2308
+ }
2309
+ },
2310
+ "additional_special_tokens": [
2311
+ "<|mdm_mask|>",
2312
+ "<role>",
2313
+ "</role>",
2314
+ "<|arithmetic_start|>",
2315
+ "<|arithmetic_end|>",
2316
+ "<|number_start|>",
2317
+ "<|number_end|>",
2318
+ "<IMG_CONTEXT>",
2319
+ "<img>",
2320
+ "</img>",
2321
+ "<|Mask_Cap_0|>",
2322
+ "<|Mask_Cap_1|>",
2323
+ "<|Mask_Cap_2|>",
2324
+ "<|Mask_Cap_3|>",
2325
+ "<|Mask_Cap_4|>",
2326
+ "<|Mask_Cap_5|>",
2327
+ "<|Mask_Cap_6|>",
2328
+ "<|Mask_Cap_7|>",
2329
+ "<|Mask_Cap_8|>",
2330
+ "<|Mask_Cap_9|>",
2331
+ "<|Mask_Cap_10|>",
2332
+ "<|Mask_Cap_11|>",
2333
+ "<|Mask_Cap_12|>",
2334
+ "<|Mask_Cap_13|>",
2335
+ "<|Mask_Cap_14|>",
2336
+ "<|Mask_Cap_15|>"
2337
+ ],
2338
+ "auto_map": {
2339
+ "AutoProcessor": "processing_pdmllm.PDMLLMProcessor"
2340
+ },
2341
+ "bos_token": "<|startoftext|>",
2342
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
2343
+ "clean_up_tokenization_spaces": false,
2344
+ "cls_token": "[CLS]",
2345
+ "eos_token": "<|endoftext|>",
2346
+ "extra_special_tokens": {},
2347
+ "fast_tokenizer": true,
2348
+ "gmask_token": "[gMASK]",
2349
+ "merges_file": null,
2350
+ "model_input_names": [
2351
+ "input_ids",
2352
+ "attention_mask"
2353
+ ],
2354
+ "model_max_length": 1000000000000000019884624838656,
2355
+ "pad_token": "<|endoftext|>",
2356
+ "processor_class": "PDMLLMProcessor",
2357
+ "tokenizer_class": "PreTrainedTokenizer",
2358
+ "trust_remote_code": true
2359
+ }