JungleGym committed on
Commit 04a0a09 · verified · 1 Parent(s): 3cf41db

Upload processing_timelens.py with huggingface_hub

Files changed (1)
  1. processing_timelens.py +227 -0
processing_timelens.py ADDED
@@ -0,0 +1,227 @@
+ # Modified from https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import numpy as np
+ import torch
+ from transformers import Qwen2_5_VLProcessor
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
+     Qwen2_5_VLProcessorKwargs,
+ )
+
+
+ class TimeLensProcessor(Qwen2_5_VLProcessor):
+     r"""
+     Constructs a TimeLens processor: a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2
+     tokenizer into a single processor. [`Qwen2_5_VLProcessor`] offers all the functionalities of
+     [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the [`~Qwen2_5_VLProcessor.__call__`] and
+     [`~Qwen2_5_VLProcessor.decode`] for more information. TimeLens modifies video handling: each video placeholder
+     in the prompt is expanded into timestamped per-frame segments, and the processed frames are returned through
+     the image input pathway.
+
+     Args:
+         image_processor ([`Qwen2VLImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`Qwen2TokenizerFast`], *optional*):
+             The tokenizer is a required input.
+         video_processor ([`Qwen2_5_VLVideoProcessor`], *optional*):
+             The video processor is a required input.
+         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+             in a chat into a tokenizable string.
+     """
+
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+         video_processor=None,
+         chat_template=None,
+         **kwargs,
+     ):
+         super().__init__(
+             image_processor, tokenizer, video_processor, chat_template, **kwargs
+         )
+         # ============ [TimeLens] Modification BEGIN ============
+         # Use the tokenizer's vision delimiters when it defines them,
+         # otherwise fall back to the default Qwen special tokens.
+         self.vision_start = getattr(tokenizer, "vision_start", "<|vision_start|>")
+         self.vision_end = getattr(tokenizer, "vision_end", "<|vision_end|>")
+         # ============ [TimeLens] Modification END ==============
+
+     def __call__(
+         self,
+         images=None,
+         text=None,
+         videos=None,
+         **kwargs,
+     ) -> BatchFeature:
+         """
+         Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
+         the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
+         Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+
+         Args:
+             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                 tensor. Both channels-first and channels-last formats are supported.
+             text (`str`, `list[str]`, `list[list[str]]`):
+                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+             videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                 The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+                 For TimeLens, each video must be passed as a `(video_tensor, metadata)` tuple, where `metadata` is
+                 a dict with the keys `"fps"` and `"frames_indices"`.
+             return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                 If set, will return tensors of a particular framework. Acceptable values are:
+                 - `'tf'`: Return TensorFlow `tf.constant` objects.
+                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                 - `'np'`: Return NumPy `np.ndarray` objects.
+                 - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+         Returns:
+             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+               `None`).
+             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+             - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+             - **image_grid_thw** -- List of 3D grid sizes (t, h, w) for each image. Returned when `images` is not `None`.
+             - **video_grid_thw** -- List of 3D grid sizes (t, h, w) for each video. Returned when `videos` is not `None`.
+             - **second_per_grid_ts** -- List of seconds per temporal grid step for each video. Returned when `videos` is not `None`.
+         """
+         output_kwargs = self._merge_kwargs(
+             Qwen2_5_VLProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+             **kwargs,
+         )
+
+         image_inputs = videos_inputs = {}
+         if images is not None:
+             image_inputs = self.image_processor(
+                 images=images, **output_kwargs["images_kwargs"]
+             )
+             image_grid_thw = image_inputs["image_grid_thw"]
+
+         if videos is not None:
+             # ============ [TimeLens] Modification BEGIN ============
+             # `videos` is a list of (video_tensor, metadata) tuples.
+             videos, metadata = [v[0] for v in videos], [v[1] for v in videos]
+             # Copy each even-indexed frame onto the following odd index, so
+             # every temporal patch pair holds two copies of the same frame.
+             for cur_video_tensor in videos:
+                 cur_video_tensor[1::2] = cur_video_tensor[::2]
+             # Timestamps (in seconds) of the retained frames: every second
+             # sampled frame index divided by the source fps.
+             frames_timestamps = [
+                 [
+                     idx / cur_metadata["fps"]
+                     for idx in cur_metadata["frames_indices"][::2]
+                 ]
+                 for cur_metadata in metadata
+             ]
+
+             videos_inputs = self.video_processor(
+                 videos=videos, **output_kwargs["videos_kwargs"]
+             )
+             video_grid_thw = videos_inputs["video_grid_thw"]
+             # ============ [TimeLens] Modification END ==============
+
+         if not isinstance(text, list):
+             text = [text]
+
+         text = text.copy()  # the loops below modify `text` in place
+         if images is not None:
+             merge_length = self.image_processor.merge_size**2
+             index = 0
+             for i in range(len(text)):
+                 while self.image_token in text[i]:
+                     num_image_tokens = image_grid_thw[index].prod() // merge_length
+                     text[i] = text[i].replace(
+                         self.image_token, "<|placeholder|>" * num_image_tokens, 1
+                     )
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+         if videos is not None:
+             merge_length = self.video_processor.merge_size**2
+             index = 0
+             # ============ [TimeLens] Modification BEGIN ============
+             for i in range(len(text)):
+                 while self.video_token in text[i]:
+                     cur_video_tokens = ""
+                     num_tokens_per_frame = (
+                         video_grid_thw[index][1:].prod() // merge_length
+                     )
+                     per_frame_tokens = (
+                         self.vision_start
+                         + "<|placeholder|>" * num_tokens_per_frame
+                         + self.vision_end
+                     )
+                     # Prefix each frame's token block with its timestamp,
+                     # e.g. "1.5s: <|vision_start|>...<|vision_end|>".
+                     for cur_frames_timestamp in frames_timestamps[index]:
+                         cur_video_tokens += (
+                             f"{cur_frames_timestamp:.1f}s: " + per_frame_tokens
+                         )
+
+                     text[i] = text[i].replace(
+                         self.vision_start + self.video_token + self.vision_end,
+                         cur_video_tokens,
+                         1,
+                     )
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
+             # modeling_qwen2_5_vl.py calls `.item()` on the entries of
+             # `image_grid_thw` to convert t, h, w from tensor to int, so build
+             # image_grid_thw as a Tensor: one [1, grid_h, grid_w] row per
+             # frame of each video.
+             image_grid_thw = torch.tensor(
+                 [
+                     [1, grid_h, grid_w]
+                     for grid_t, grid_h, grid_w in video_grid_thw
+                     for _ in range(grid_t)
+                 ],
+                 dtype=torch.long,
+             )
+
+             # Route the processed video frames through the image pathway.
+             # NOTE: if images were passed together with videos, the image
+             # inputs computed above are overwritten here.
+             image_inputs = {
+                 # [grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size] = [num_patches, dim]
+                 "pixel_values": videos_inputs["pixel_values_videos"],
+                 "image_grid_thw": image_grid_thw,
+             }
+             videos_inputs = {}
+             # ============ [TimeLens] Modification END ==============
+
+         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop(
+             "return_mm_token_type_ids", None
+         )
+         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+         self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
+
+         if return_mm_token_type_ids:
+             array_ids = np.array(text_inputs["input_ids"])
+             mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+             mm_token_type_ids[array_ids == self.image_token_id] = 1
+             text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+         return BatchFeature(
+             data={**text_inputs, **image_inputs, **videos_inputs},
+             tensor_type=return_tensors,
+         )
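
A minimal sketch of what the video preprocessing above does on the frame axis. The frame count, fps, and frames_indices values here are made up for illustration; a real video tensor would be (num_frames, channels, height, width):

import torch

# Stand-in for a video: one scalar per frame instead of a full (C, H, W) frame.
video = torch.arange(6, dtype=torch.float32)
metadata = {"fps": 2.0, "frames_indices": [0, 1, 2, 3, 4, 5]}  # illustrative values

# The processor's in-place duplication: copy even-indexed frames onto odd slots.
video[1::2] = video[::2]
print(video)  # tensor([0., 0., 2., 2., 4., 4.])

# Timestamps of the retained frames: every second sampled index divided by fps.
timestamps = [idx / metadata["fps"] for idx in metadata["frames_indices"][::2]]
print(timestamps)  # [0.0, 1.0, 2.0]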
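
And a hedged end-to-end usage sketch. The checkpoint path is a placeholder, and loading this class through AutoProcessor assumes the repository wires it up for remote code; what the file above actually fixes is the input contract: each video is a (frames, metadata) tuple whose metadata carries "fps" and "frames_indices":

import torch
from transformers import AutoProcessor

# "path/to/timelens" is a placeholder for the actual checkpoint id.
processor = AutoProcessor.from_pretrained("path/to/timelens", trust_remote_code=True)

frames = torch.randint(0, 256, (8, 3, 224, 224), dtype=torch.uint8)  # 8 RGB frames
metadata = {"fps": 2.0, "frames_indices": list(range(8))}  # one index per frame

text = "<|vision_start|><|video_pad|><|vision_end|>Describe the video."
inputs = processor(text=[text], videos=[(frames, metadata)], return_tensors="pt")

# The single video placeholder is expanded into timestamped per-frame segments
# ("0.0s: ...", "1.0s: ...", ...), and the frames come back through the image
# pathway: `pixel_values` plus one [1, grid_h, grid_w] row per frame in
# `image_grid_thw`.
print(inputs["input_ids"].shape, inputs["image_grid_thw"].shape)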