jiang-cc committed on
Commit
b9b4987
·
verified ·
1 Parent(s): 91a25d9

Upload processor

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_ad_copilot.py ADDED
@@ -0,0 +1,740 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from typing import Any, Callable, Optional, Union
6
+
7
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoModelForImageTextToText
8
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
9
+ Qwen2_5_VisionTransformerPretrainedModel,
10
+ Qwen2_5_VLModel,
11
+ Qwen2RMSNorm,
12
+ Qwen2_5_VLMLP,
13
+ ALL_ATTENTION_FUNCTIONS
14
+ )
15
+ from transformers.image_utils import ImageInput
16
+ from transformers.tokenization_utils import TextInput, PreTokenizedInput
17
+ from transformers.video_utils import VideoInput
18
+ from transformers.feature_extraction_utils import BatchFeature
19
+
20
+ from transformers import Qwen2_5_VLProcessor, Qwen2_5_VLConfig
21
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessorKwargs
22
+
23
class ADCopilotConfig(Qwen2_5_VLConfig):
    """Configuration for the AD-Copilot model, a Qwen2.5-VL variant whose vision
    tower emits extra "compare" tokens per image.

    Args:
        compare_token_size (`int`, defaults to 100):
            Number of learnable compare tokens produced per image by the
            compare visual encoder. Mirrored onto ``vision_config`` so the
            vision tower can read it.
        sequence_compare (`bool`, defaults to True):
            When True, each frame is compared against the previous frame
            (the first frame is compared against itself).
        kwargs:
            Forwarded unchanged to `Qwen2_5_VLConfig`.
    """

    model_type = "ad_copilot"

    def __init__(self, compare_token_size: int = 100, sequence_compare: bool = True, **kwargs):
        super().__init__(**kwargs)
        # Previously hard-coded to 100; now parameterized with the same default
        # so existing callers are unaffected.
        self.vision_config.compare_token_size = compare_token_size
        self.architectures = ["ADCopilotVLForConditionalGeneration"]
        self.sequence_compare = sequence_compare
30
+
31
class ADCopilotProcessor(Qwen2_5_VLProcessor):
    """Processor for AD-Copilot.

    Behaves like `Qwen2_5_VLProcessor`, except that every image placeholder in
    the text is expanded by ``compare_token_size`` additional image tokens,
    reserving slots for the compare-token embeddings produced by the vision
    tower.
    """

    config_class = ADCopilotConfig

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        # Pop our custom kwarg before calling the parent constructor so that an
        # argument the parent does not know about is never forwarded to it.
        # (Previously `compare_token_size` was read from kwargs but also passed
        # through to `super().__init__`.)
        compare_token_size = kwargs.pop("compare_token_size", 100)
        super().__init__(image_processor, tokenizer, video_processor, chat_template, **kwargs)
        # Extra image-token slots added per image placeholder (see __call__).
        self.compare_token_size = compare_token_size

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: VideoInput = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Two distinct dicts. The previous `image_inputs = videos_inputs = {}`
        # bound both names to ONE dict object; mutating either would have
        # silently mutated the other.
        image_inputs = {}
        videos_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]

        if videos is not None:
            fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            video_grid_thw = videos_inputs["video_grid_thw"]

            if isinstance(fps, (int, float)):
                second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [self.video_processor.temporal_patch_size / tmp for tmp in fps]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # the loops below mutate `text` in place
        if images is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    num_image_tokens = image_grid_thw[index].prod() // merge_length
                    # AD-Copilot change vs. upstream Qwen2.5-VL: reserve
                    # `compare_token_size` extra image-token slots per image for
                    # the compare-token embeddings emitted by the vision tower.
                    text[i] = text[i].replace(
                        self.image_token, "<|placeholder|>" * (num_image_tokens + self.compare_token_size), 1
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if videos is not None:
            merge_length = self.video_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    num_video_tokens = video_grid_thw[index].prod() // merge_length
                    text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        if return_mm_token_type_ids:
            # NOTE(review): only image tokens are marked here (video tokens are
            # not) — confirm this matches the model's consumer of
            # `mm_token_type_ids`.
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
144
+
145
+
146
class OptimizedCrossAttention(nn.Module):
    """
    Optimized cross attention modeled on the Qwen2_5_VLVisionAttention layout.

    Supports two modes selected at construction time:
      - cross attention (`is_cross_attention=True`): Q is projected from one
        sequence, K/V from another, via a fused K/V linear;
      - self attention: Q/K/V all come from the same sequence via a fused
        QKV linear.
    """
    def __init__(self, config, is_cross_attention=True):
        super().__init__()
        self.config = config
        self.dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.dim // self.num_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = 0.0
        self.is_causal = False  # cross attention needs no causal mask
        self.is_cross_attention = is_cross_attention

        if is_cross_attention:
            # Cross attention: Q comes from one sequence; K and V from another.
            self.q_proj = nn.Linear(self.dim, self.dim, bias=True)
            self.kv = nn.Linear(self.dim, self.dim * 2, bias=True)  # fused K/V projection
        else:
            # Self attention: Q, K and V come from the same sequence.
            self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)  # fused Q/K/V projection

        self.proj = nn.Linear(self.dim, self.dim, bias=True)

    def forward(
        self,
        query_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,  # only used by the FA2 path
        kv_cu_seqlens: Optional[torch.Tensor] = None,  # only used by the FA2 path
        **kwargs,
    ) -> torch.Tensor:
        """Run attention over `query_states` (optionally against
        `key_value_states` in cross-attention mode) and return the projected
        output with the same leading shape as the input.

        Accepts `query_states` as [B, T, d] or [T, d]; a 2-D input is given a
        batch dimension of 1 and the output is squeezed back to 2-D.
        """
        # Allow query_states of shape [B, T, d] or [T, d]; auto-add batch dim.
        orig_2d = False
        if query_states.dim() == 2:
            query_states = query_states.unsqueeze(0)
            orig_2d = True

        batch_size, seq_len_q, _ = query_states.shape

        # Q/K/V projections.
        if self.is_cross_attention and key_value_states is not None:
            if key_value_states.dim() == 2:
                key_value_states = key_value_states.unsqueeze(0)
            q = self.q_proj(query_states)
            kv = self.kv(key_value_states)
            seq_len_kv = kv.shape[1]
            k, v = kv.reshape(batch_size, seq_len_kv, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)
            q = q.reshape(batch_size, seq_len_q, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            # Self-attention fallback (also used when no key_value_states given
            # in cross mode — then K/V degenerate to the query sequence).
            if key_value_states is None:
                key_value_states = query_states
            qkv = self.qkv(query_states)
            q, k, v = qkv.reshape(batch_size, seq_len_q, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)

        # Choose which attention kernel to use.
        attn_impl = getattr(self.config, '_attn_implementation', 'sdpa')
        # NOTE(review): this unconditional override forces SDPA and makes the
        # configured implementation — and the entire flash_attention_2 branch
        # below — dead code. Confirm whether this is a deliberate workaround or
        # a debugging leftover that should be removed.
        attn_impl = 'sdpa'
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS[attn_impl]

        # ========= FlashAttention-2 support ==========
        if attn_impl == "flash_attention_2":
            # Qwen2_5 supports FA2 by preparing flattened tensors + cu_seqlens.
            # Here we assume query_states/key_value_states are variable-length
            # along the batch dimension.

            # Use the provided cu_seqlens if present; otherwise generate them.
            if cu_seqlens is None:
                # Default: treat every batch element as having length seq_len_q.
                cu_seqlens = torch.arange(0, (batch_size + 1) * seq_len_q, step=seq_len_q, dtype=torch.int32, device=q.device)
            if kv_cu_seqlens is None:
                cu_seqlens_k = torch.arange(0, (batch_size + 1) * k.shape[2], step=k.shape[2], dtype=torch.int32, device=k.device)
            else:
                cu_seqlens_k = kv_cu_seqlens

            # flatten [B, nH, T, d] -> [total_T, nH, d]
            # Note: FlashAttention-2 expects (total, nH, d), NOT (nH, total, d)
            # as the ordinary implementations do.
            # Safer flatten:
            # [B, nH, T, d] -> [B, T, nH, d] -> [total_T, nH, d]
            q_ = q.transpose(1, 2).contiguous().view(-1, self.num_heads, self.head_dim)
            k_ = k.transpose(1, 2).contiguous().view(-1, self.num_heads, self.head_dim)
            v_ = v.transpose(1, 2).contiguous().view(-1, self.num_heads, self.head_dim)

            max_seqlen_q = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
            max_seqlen_k = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).max().item()

            attn_output, _ = attention_interface(
                self,
                q_,
                k_,
                v_,
                attention_mask=None,
                scaling=self.scaling,
                dropout=0.0 if not self.training else self.attention_dropout,
                cu_seq_lens_q=cu_seqlens,
                cu_seq_lens_k=cu_seqlens_k,
                max_length_q=max_seqlen_q,
                max_length_k=max_seqlen_k,
                is_causal=self.is_causal,
                **kwargs,
            )

            # Rebuild the batched output:
            # [total_q, nH, d] -> [B, seq_len_q, nH, d]
            attn_output = attn_output.view(batch_size, seq_len_q, self.num_heads, self.head_dim).contiguous()
        else:
            # Ordinary implementations take [B, nH, T, d].
            attn_output, _ = attention_interface(
                self,
                q, k, v,
                attention_mask=attention_mask,
                scaling=self.scaling,
                dropout=0.0 if not self.training else self.attention_dropout,
                is_causal=self.is_causal,
                **kwargs,
            )
            # attn_output: [B, nH, seq_q, d]
            # NOTE(review): some ALL_ATTENTION_FUNCTIONS backends already return
            # (B, seq, nH, d); verify this extra transpose against the installed
            # transformers version.
            attn_output = attn_output.transpose(1, 2).contiguous()  # [B, seq_q, nH, d]

        attn_output = attn_output.reshape(batch_size, seq_len_q, self.dim)  # [B, seq_q, D]
        attn_output = self.proj(attn_output)
        if orig_2d:
            attn_output = attn_output.squeeze(0)
        return attn_output.contiguous()
271
+
272
+
273
class ADCopilotCompareVisualEncoder(nn.Module):
    """Encoder–decoder module that compares each image's vision features with
    the previous image's features and compresses the comparison into a fixed
    number of learnable "compare" query tokens.

    Encoder: two cross-attention passes (previous→current, then
    current→previous) with RMSNorm + MLP blocks; the second MLP result is
    SUBTRACTED from the residual (deliberate design, see `_encoder_forward`).
    Decoder: learnable query embeddings cross-attend to the encoded features
    and are projected to the language-model hidden size.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # When True, frame i is compared with frame i-1; the first frame is
        # compared with itself (see the roll/overwrite logic in forward()).
        self.sequence_compare = getattr(config, "sequence_compare", True)
        self.hidden_size = config.hidden_size
        # FIX: the original used `"compare_token_size" not in config`, but a
        # membership test on a config object is not reliably supported by
        # PretrainedConfig-style classes. Use getattr with the same default,
        # consistent with `sequence_compare` above.
        self.token_size = getattr(config, "compare_token_size", 100)

        # Encoder: bidirectional image-feature interaction.
        # First cross attention: previous attends to current.
        self.encoder_cross_attn1 = OptimizedCrossAttention(config, is_cross_attention=True)
        # Second cross attention: current attends to previous.
        self.encoder_cross_attn2 = OptimizedCrossAttention(config, is_cross_attention=True)

        self.encoder_norm1 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.encoder_norm2 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.encoder_norm3 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.encoder_norm4 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.encoder_mlp1 = Qwen2_5_VLMLP(config)
        self.encoder_mlp2 = Qwen2_5_VLMLP(config)

        # Decoder: queries interact with the encoded features.
        # Learnable query embeddings (must be initialized via
        # init_query_embeddings(); torch.empty leaves them uninitialized).
        self.query_embeddings = nn.Parameter(
            torch.empty(self.token_size, self.hidden_size)
        )
        # Only a cross attention is kept: queries attend to encoded features.
        self.decoder_cross_attn = OptimizedCrossAttention(config, is_cross_attention=True)

        self.decoder_norm1 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.decoder_norm2 = Qwen2RMSNorm(self.hidden_size, eps=1e-6)
        self.decoder_mlp = Qwen2_5_VLMLP(config)

        self.compare_projector = nn.Linear(config.hidden_size, config.out_hidden_size)

    def init_query_embeddings(self):
        """Initialize the learnable query embeddings (normal, std=0.02)."""
        nn.init.normal_(self.query_embeddings, mean=0.0, std=0.02)

    def forward(self, images_hidden_states: list) -> torch.Tensor:
        """
        Args:
            images_hidden_states: List of tensors, each of shape
                [seq_len, hidden_size] (per-image vision features; lengths may
                differ between images).

        Returns:
            Tensor of shape [total_images, token_size, out_hidden_size].
        """
        if not images_hidden_states:
            # NOTE(review): created on the default device/dtype — there is no
            # input tensor here to infer them from; confirm callers handle this.
            return torch.empty(0, self.token_size, self.hidden_size)

        # Guard against uninitialized query embeddings (torch.empty in __init__).
        if torch.isnan(self.query_embeddings).any():
            print("警告:query_embeddings 包含 NaN 值")

        # Per-image sequence lengths.
        seq_lengths = [state.size(0) for state in images_hidden_states]
        max_seq_len = max(seq_lengths)
        batch_size = len(images_hidden_states)
        device = images_hidden_states[0].device
        dtype = images_hidden_states[0].dtype

        # Pad every image to the same length and stack.
        padded_states = []
        attention_masks = []
        for state in images_hidden_states:
            pad_len = max_seq_len - state.size(0)
            if pad_len > 0:
                # Pad the sequence with zeros...
                padded_state = F.pad(state, (0, 0, 0, pad_len), mode='constant', value=0)
                # ...and mask out the padded positions.
                attention_mask = torch.ones(max_seq_len, dtype=torch.bool, device=device)
                attention_mask[state.size(0):] = False
            else:
                padded_state = state
                attention_mask = torch.ones(max_seq_len, dtype=torch.bool, device=device)
            padded_states.append(padded_state)
            attention_masks.append(attention_mask)

        # [batch_size, max_seq_len, hidden_size]
        batched_states = torch.stack(padded_states)
        # [batch_size, max_seq_len]
        attention_masks = torch.stack(attention_masks)

        # Build the "previous frame" tensor by a circular shift; the first
        # image then has its previous overwritten with itself below.
        previous_states = torch.roll(batched_states, shifts=1, dims=0)
        previous_masks = torch.roll(attention_masks, shifts=1, dims=0)

        if previous_states.size(0) > 1 and self.sequence_compare:
            # After the roll, previous_states[1] is batched_states[0], i.e. the
            # first image itself — so the first image is compared with itself.
            previous_states[0] = previous_states[1]
            previous_masks[0] = previous_masks[1]

        # Encoder: process all images as one batch.
        encoded_features = self._encoder_forward(
            batched_states,   # [batch_size, max_seq_len, hidden_size]
            previous_states,  # [batch_size, max_seq_len, hidden_size]
            attention_masks,  # [batch_size, max_seq_len]
            previous_masks    # [batch_size, max_seq_len]
        )

        # Decoder: broadcast the query embeddings over the batch.
        batch_queries = self.query_embeddings.unsqueeze(0).expand(batch_size, -1, -1)
        # [batch_size, token_size, hidden_size]
        compare_visual_embeds = self._decoder_forward(
            batch_queries,
            encoded_features,
            torch.ones(batch_size, self.token_size, dtype=torch.bool, device=device),  # query mask
            attention_masks  # mask for the encoded features
        )

        # Project every compare token to the LM hidden size and restore shape.
        token_size = compare_visual_embeds.size(1)
        flattened_embeds = compare_visual_embeds.view(-1, compare_visual_embeds.size(-1))
        merged = self.compare_projector(flattened_embeds)  # [batch_size * token_size, out_hidden_size]
        compare_visual_embeds = merged.view(batch_size, token_size, -1)

        return compare_visual_embeds  # [batch_size, token_size, out_hidden_size]

    def _encoder_forward(self, current_features, previous_features, current_mask=None, previous_mask=None):
        """
        Encoder: bidirectional interaction between current and previous features.
        Args:
            current_features: [batch_size, seq_len, hidden_size]
            previous_features: [batch_size, seq_len, hidden_size]
            current_mask: [batch_size, seq_len]
            previous_mask: [batch_size, seq_len]
        """
        # Step 1: previous attends to current.
        residual = previous_features

        # Layer norm (encoder_norm1 is deliberately shared by both inputs).
        previous_normed = self.encoder_norm1(previous_features)
        current_normed1 = self.encoder_norm1(current_features)

        # Cross attention: previous attends to current.
        cross_attn_output1 = self.encoder_cross_attn1(
            query_states=previous_normed,
            key_value_states=current_normed1,
            attention_mask=current_mask.unsqueeze(1).unsqueeze(2) if current_mask is not None else None
        )

        # Residual connection.
        previous_features = residual + cross_attn_output1

        # MLP block for the previous features.
        residual = previous_features
        mlp_input1 = self.encoder_norm2(previous_features)
        mlp_output1 = self.encoder_mlp1(mlp_input1)
        previous_features = residual + mlp_output1

        # Step 2: current attends to the (enhanced) previous.
        residual = current_features

        # Layer norm (encoder_norm3 shared by both inputs).
        current_normed2 = self.encoder_norm3(current_features)
        previous_normed2 = self.encoder_norm3(previous_features)

        # Cross attention: current attends to previous.
        cross_attn_output2 = self.encoder_cross_attn2(
            query_states=current_normed2,
            key_value_states=previous_normed2,
            attention_mask=previous_mask.unsqueeze(1).unsqueeze(2) if previous_mask is not None else None
        )

        # Residual connection.
        current_features = residual + cross_attn_output2

        # MLP block for the current features — NOTE: the MLP output is
        # SUBTRACTED (original comment: "changed to subtraction"). This is a
        # deliberate design choice, not a typo.
        residual = current_features
        mlp_input2 = self.encoder_norm4(current_features)
        mlp_output2 = self.encoder_mlp2(mlp_input2)
        current_features = residual - mlp_output2
        return current_features

    def _decoder_forward(self, queries, encoded_features, query_mask=None, encoded_mask=None):
        """
        Decoder: queries interact with the encoded features.
        Args:
            queries: [batch_size, token_size, hidden_size]
            encoded_features: [batch_size, seq_len, hidden_size]
            query_mask: [batch_size, token_size] (currently unused)
            encoded_mask: [batch_size, seq_len]
        """
        # Cross attention: queries attend to the encoded features.
        residual = queries
        queries_normed = self.decoder_norm1(queries)
        encoded_normed = self.decoder_norm1(encoded_features)

        cross_attn_output = self.decoder_cross_attn(
            query_states=queries_normed,
            key_value_states=encoded_normed,
            attention_mask=encoded_mask.unsqueeze(1).unsqueeze(2) if encoded_mask is not None else None
        )

        queries = residual + cross_attn_output

        # MLP block.
        residual = queries
        mlp_input = self.decoder_norm2(queries)
        mlp_output = self.decoder_mlp(mlp_input)
        queries = residual + mlp_output

        return queries  # [batch_size, token_size, hidden_size]
484
+
485
+
486
+ # 先把组件继承出来方便修改
487
+ class ADCopilotVisionTransformerPretrainedModel(Qwen2_5_VisionTransformerPretrainedModel):
488
+ def __init__(self, config, *inputs, **kwargs) -> None:
489
+ super().__init__(config, *inputs, **kwargs)
490
+ self.compare_visual_encoder = ADCopilotCompareVisualEncoder(config)
491
+
492
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
493
+ """
494
+ Args:
495
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
496
+ The final hidden states of the model.
497
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
498
+ The temporal, height and width of feature shape of each image in LLM.
499
+
500
+ Returns:
501
+ `torch.Tensor`: hidden_states, compare_visual_embeds.
502
+ """
503
+ hidden_states = self.patch_embed(hidden_states)
504
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
505
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
506
+ cu_window_seqlens = torch.tensor(
507
+ cu_window_seqlens,
508
+ device=hidden_states.device,
509
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
510
+ )
511
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
512
+
513
+ seq_len, _ = hidden_states.size()
514
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
515
+ hidden_states = hidden_states[window_index, :, :]
516
+ hidden_states = hidden_states.reshape(seq_len, -1)
517
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
518
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
519
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
520
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
521
+ position_embeddings = (emb.cos(), emb.sin())
522
+
523
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
524
+ dim=0,
525
+ # Select dtype based on the following factors:
526
+ # - FA2 requires that cu_seqlens_q must have dtype int32
527
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
528
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
529
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
530
+ )
531
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
532
+
533
+ for layer_num, blk in enumerate(self.blocks):
534
+ if layer_num in self.fullatt_block_indexes:
535
+ cu_seqlens_now = cu_seqlens
536
+ else:
537
+ cu_seqlens_now = cu_window_seqlens
538
+
539
+ hidden_states = blk(
540
+ hidden_states,
541
+ cu_seqlens=cu_seqlens_now,
542
+ position_embeddings=position_embeddings,
543
+ **kwargs,
544
+ )
545
+
546
+ split_sizes = grid_thw.prod(-1).tolist()
547
+ splited_hidden_states_before_merger = torch.split(hidden_states, split_sizes)
548
+ # [total_images, token_size, hidden_size]
549
+ compare_visual_embeds = self.compare_visual_encoder(splited_hidden_states_before_merger)
550
+
551
+
552
+ hidden_states = self.merger(hidden_states)
553
+ reverse_indices = torch.argsort(window_index)
554
+ hidden_states = hidden_states[reverse_indices, :]
555
+
556
+ return hidden_states, compare_visual_embeds
557
+
558
+ class ADCopilotVLModel(Qwen2_5_VLModel):
559
+ def __init__(self, config):
560
+ super().__init__(config)
561
+ self.visual = ADCopilotVisionTransformerPretrainedModel._from_config(config.vision_config)
562
+ self.compare_token_size = config.vision_config.compare_token_size
563
+ # self.learnable_image_embeddings = nn.Parameter(
564
+ # torch.randn(100, config.hidden_size) * 0.02 # 使用小的初始化值
565
+ # )
566
+
567
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
568
+ """
569
+ Encodes images into continuous embeddings that can be forwarded to the language model.
570
+
571
+ Args:
572
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
573
+ The tensors corresponding to the input images.
574
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
575
+ The temporal, height and width of feature shape of each image in LLM.
576
+ """
577
+ pixel_values = pixel_values.type(self.visual.dtype)
578
+ image_embeds, compare_visual_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
579
+ # 每个图像添加了对比感知token
580
+ split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
581
+ image_embeds = torch.split(image_embeds, split_sizes)
582
+
583
+ # 将图像嵌入和对比视觉嵌入拼接
584
+ enhanced_image_embeds = []
585
+ for i, embeds in enumerate(image_embeds):
586
+ # 确保 compare_visual_embeds[i] 与 embeds 在相同设备和数据类型
587
+ compare_embed = compare_visual_embeds[i].to(device=embeds.device, dtype=embeds.dtype)
588
+ enhanced_embeds = torch.cat([embeds, compare_embed], dim=0)
589
+ enhanced_image_embeds.append(enhanced_embeds)
590
+
591
+ # image_embeds = torch.cat(enhanced_image_embeds, dim=0)
592
+ return enhanced_image_embeds
593
+
594
+ def get_rope_index(self, input_ids: Optional[torch.LongTensor] = None, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, second_per_grid_ts: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None) -> tuple[torch.Tensor, torch.Tensor]:
595
+ return self.get_rope_index_with_compare_token(input_ids, image_grid_thw, video_grid_thw, second_per_grid_ts, attention_mask)
596
+
597
+ def get_rope_index_with_compare_token(
598
+ self,
599
+ input_ids: Optional[torch.LongTensor] = None,
600
+ image_grid_thw: Optional[torch.LongTensor] = None,
601
+ video_grid_thw: Optional[torch.LongTensor] = None,
602
+ second_per_grid_ts: Optional[torch.Tensor] = None,
603
+ attention_mask: Optional[torch.Tensor] = None,
604
+ ) -> tuple[torch.Tensor, torch.Tensor]:
605
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
606
+ image_token_id = self.config.image_token_id
607
+ video_token_id = self.config.video_token_id
608
+ vision_start_token_id = self.config.vision_start_token_id
609
+ mrope_position_deltas = []
610
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
611
+ total_input_ids = input_ids
612
+ if attention_mask is None:
613
+ attention_mask = torch.ones_like(total_input_ids)
614
+ position_ids = torch.ones(
615
+ 3,
616
+ input_ids.shape[0],
617
+ input_ids.shape[1],
618
+ dtype=input_ids.dtype,
619
+ device=input_ids.device,
620
+ )
621
+ image_index, video_index = 0, 0
622
+ attention_mask = attention_mask.to(total_input_ids.device)
623
+ for i, input_ids in enumerate(total_input_ids):
624
+ input_ids = input_ids[attention_mask[i] == 1]
625
+ image_nums, video_nums = 0, 0
626
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
627
+ vision_tokens = input_ids[vision_start_indices + 1]
628
+ image_nums = (vision_tokens == image_token_id).sum()
629
+ video_nums = (vision_tokens == video_token_id).sum()
630
+ input_tokens = input_ids.tolist()
631
+ llm_pos_ids_list: list = []
632
+ st = 0
633
+ remain_images, remain_videos = image_nums, video_nums
634
+ for vision_index in range(image_nums + video_nums):
635
+ if image_token_id in input_tokens and remain_images > 0:
636
+ ed_image = input_tokens.index(image_token_id, st)
637
+ else:
638
+ ed_image = len(input_tokens) + 1
639
+ if video_token_id in input_tokens and remain_videos > 0:
640
+ ed_video = input_tokens.index(video_token_id, st)
641
+ else:
642
+ ed_video = len(input_tokens) + 1
643
+ if ed_image < ed_video:
644
+ t, h, w = (
645
+ image_grid_thw[image_index][0],
646
+ image_grid_thw[image_index][1],
647
+ image_grid_thw[image_index][2],
648
+ )
649
+ second_per_grid_t = 0
650
+ image_index += 1
651
+ remain_images -= 1
652
+ ed = ed_image
653
+
654
+ else:
655
+ t, h, w = (
656
+ video_grid_thw[video_index][0],
657
+ video_grid_thw[video_index][1],
658
+ video_grid_thw[video_index][2],
659
+ )
660
+ if second_per_grid_ts is not None:
661
+ second_per_grid_t = second_per_grid_ts[video_index]
662
+ else:
663
+ second_per_grid_t = 1.0
664
+ video_index += 1
665
+ remain_videos -= 1
666
+ ed = ed_video
667
+ llm_grid_t, llm_grid_h, llm_grid_w = (
668
+ t.item(),
669
+ h.item() // spatial_merge_size,
670
+ w.item() // spatial_merge_size,
671
+ )
672
+ text_len = ed - st
673
+
674
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
675
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
676
+
677
+ range_tensor = torch.arange(llm_grid_t).view(-1, 1)
678
+ expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
679
+
680
+ ## normalize type, send to device.
681
+ second_per_grid_t = torch.as_tensor(
682
+ second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
683
+ )
684
+
685
+ time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
686
+
687
+ time_tensor_long = time_tensor.long()
688
+ t_index = time_tensor_long.flatten()
689
+
690
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
691
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
692
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
693
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
694
+ if ed_image < ed_video:
695
+ # 如果当前是图片,则需要插入 compare_token_size 个图像对比的token的position
696
+ compare_t_index = t_index[-1].repeat(self.compare_token_size)
697
+ # compare_h_index = torch.arange(self.compare_token_size)
698
+ # compare_w_index = torch.arange(self.compare_token_size)
699
+ compare_h_index = compare_t_index
700
+ compare_w_index = compare_t_index
701
+ llm_pos_ids_list.append(torch.stack([compare_t_index, compare_h_index, compare_w_index]) + text_len + st_idx)
702
+ st = st + self.compare_token_size
703
+
704
+ if st < len(input_tokens):
705
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
706
+ text_len = len(input_tokens) - st
707
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
708
+
709
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
710
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
711
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
712
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
713
+ return position_ids, mrope_position_deltas
714
+ else:
715
+ if attention_mask is not None:
716
+ position_ids = attention_mask.long().cumsum(-1) - 1
717
+ position_ids.masked_fill_(attention_mask == 0, 1)
718
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
719
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
720
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
721
+ else:
722
+ position_ids = (
723
+ torch.arange(input_ids.shape[1], device=input_ids.device)
724
+ .view(1, 1, -1)
725
+ .expand(3, input_ids.shape[0], -1)
726
+ )
727
+ mrope_position_deltas = torch.zeros(
728
+ [input_ids.shape[0], 1],
729
+ device=input_ids.device,
730
+ dtype=input_ids.dtype,
731
+ )
732
+
733
+ return position_ids, mrope_position_deltas
734
+
735
+ class ADCopilotVLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
736
+ config_class = ADCopilotConfig
737
+
738
+ def __init__(self, config):
739
+ super().__init__(config)
740
+ self.model = ADCopilotVLModel(config)
preprocessor_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "modeling_ad_copilot.ADCopilotProcessor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "disable_grouping": null,
10
+ "do_center_crop": null,
11
+ "do_convert_rgb": true,
12
+ "do_normalize": true,
13
+ "do_rescale": true,
14
+ "do_resize": true,
15
+ "image_mean": [
16
+ 0.48145466,
17
+ 0.4578275,
18
+ 0.40821073
19
+ ],
20
+ "image_processor_type": "Qwen2VLImageProcessorFast",
21
+ "image_std": [
22
+ 0.26862954,
23
+ 0.26130258,
24
+ 0.27577711
25
+ ],
26
+ "input_data_format": null,
27
+ "max_pixels": 12845056,
28
+ "merge_size": 2,
29
+ "min_pixels": 3136,
30
+ "patch_size": 14,
31
+ "processor_class": "ADCopilotProcessor",
32
+ "resample": 3,
33
+ "rescale_factor": 0.00392156862745098,
34
+ "return_tensors": null,
35
+ "size": {
36
+ "longest_edge": 12845056,
37
+ "shortest_edge": 3136
38
+ },
39
+ "temporal_patch_size": 2
40
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "modeling_ad_copilot.ADCopilotProcessor"
4
+ },
5
+ "processor_class": "ADCopilotProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
3
+ size 11422063
tokenizer_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "auto_map": {
198
+ "AutoProcessor": "modeling_ad_copilot.ADCopilotProcessor"
199
+ },
200
+ "bos_token": null,
201
+ "clean_up_tokenization_spaces": false,
202
+ "eos_token": "<|im_end|>",
203
+ "errors": "replace",
204
+ "extra_special_tokens": {},
205
+ "max_length": null,
206
+ "model_max_length": 131072,
207
+ "pad_to_multiple_of": null,
208
+ "pad_token": "<|endoftext|>",
209
+ "pad_token_type_id": 0,
210
+ "padding_side": "left",
211
+ "processor_class": "ADCopilotProcessor",
212
+ "split_special_tokens": false,
213
+ "tokenizer_class": "Qwen2Tokenizer",
214
+ "unk_token": null
215
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "modeling_ad_copilot.ADCopilotProcessor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "do_center_crop": null,
10
+ "do_convert_rgb": true,
11
+ "do_normalize": true,
12
+ "do_pad": null,
13
+ "do_rescale": true,
14
+ "do_resize": true,
15
+ "do_sample_frames": false,
16
+ "fps": null,
17
+ "image_mean": [
18
+ 0.48145466,
19
+ 0.4578275,
20
+ 0.40821073
21
+ ],
22
+ "image_std": [
23
+ 0.26862954,
24
+ 0.26130258,
25
+ 0.27577711
26
+ ],
27
+ "input_data_format": null,
28
+ "max_frames": 768,
29
+ "max_pixels": 12845056,
30
+ "merge_size": 2,
31
+ "min_frames": 4,
32
+ "min_pixels": 3136,
33
+ "num_frames": null,
34
+ "patch_size": 14,
35
+ "processor_class": "ADCopilotProcessor",
36
+ "resample": 3,
37
+ "rescale_factor": 0.00392156862745098,
38
+ "return_metadata": false,
39
+ "size": {
40
+ "longest_edge": 12845056,
41
+ "shortest_edge": 3136
42
+ },
43
+ "size_divisor": null,
44
+ "temporal_patch_size": 2,
45
+ "video_metadata": null,
46
+ "video_processor_type": "Qwen2VLVideoProcessor"
47
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff