currentfear committed on
Commit 24623aa · verified · 1 Parent(s): 397557e

Upload processing_minicpmv.py

Files changed (1): processing_minicpmv.py +255 -0
processing_minicpmv.py ADDED
@@ -0,0 +1,255 @@
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for MiniCPMV.
+ """
+
+ import re
+ from typing import Any, Dict, List, Optional, Union
+
+ import torch
+
+ from transformers.image_processing_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+ from transformers.utils import TensorType, is_torch_device, is_torch_dtype, requires_backends
+
+ from .image_processing_minicpmv import MiniCPMVBatchFeature
+
+ class MiniCPMVProcessor(ProcessorMixin):
+     r"""
+     Constructs a MiniCPMV processor, which wraps a MiniCPMV image processor and a MiniCPMV tokenizer into a single
+     processor.
+
+     [`MiniCPMVProcessor`] offers all the functionalities of [`MiniCPMVImageProcessor`] and [`LlamaTokenizerWrapper`].
+     See [`~MiniCPMVProcessor.__call__`] and [`~MiniCPMVProcessor.decode`] for more information.
+
+     Args:
+         image_processor ([`MiniCPMVImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`LlamaTokenizerWrapper`], *optional*):
+             The tokenizer is a required input.
+     """
+
+     attributes = ["image_processor", "tokenizer"]
+     image_processor_class = "AutoImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+     def __init__(self, image_processor=None, tokenizer=None):
+         super().__init__(image_processor, tokenizer)
+         self.version = image_processor.version
+
+     def __call__(
+         self,
+         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+         images: ImageInput = None,
+         max_length: Optional[int] = None,
+         do_pad: Optional[bool] = True,
+         max_slice_nums: Optional[int] = None,
+         use_image_id: Optional[bool] = None,
+         temporal_ids: Optional[Union[List[List[int]], List[List[List[int]]]]] = None,
+         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+         **kwargs,
+     ) -> MiniCPMVBatchFeature:
+         if images is not None:
+             # Preprocess the images (slicing, resizing, temporal ids), then merge the
+             # resulting image features with the tokenized text.
+             image_inputs = self.image_processor(
+                 images, do_pad=do_pad, max_slice_nums=max_slice_nums, temporal_ids=temporal_ids, return_tensors=return_tensors
+             )
+             return self._convert_images_texts_to_inputs(
+                 image_inputs, text, max_slice_nums=max_slice_nums, use_image_id=use_image_id, max_length=max_length, **kwargs
+             )
+         # Text-only inputs: fall back to plain tokenization instead of returning None.
+         return self._convert_images_texts_to_inputs(None, text, max_length=max_length, return_tensors=return_tensors, **kwargs)
+
+     # Adapted from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+     def batch_decode(self, *args, **kwargs):
+         """
+         Decode a batch of output id sequences. Padding ids (0), a leading BOS token and a trailing EOS token are
+         stripped from each sequence before it is passed to the tokenizer's [`~PreTrainedTokenizer.decode`].
+         """
+         output_ids = args[0]
+         result_text = []
+         for result in output_ids:
+             result = result[result != 0]  # drop padding ids
+             if result[0] == self.tokenizer.bos_id:
+                 result = result[1:]
+             if result[-1] == self.tokenizer.eos_id:
+                 result = result[:-1]
+             result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
+         return result_text
+
+     # Adapted from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+     def decode(self, *args, **kwargs):
+         """
+         Decode a single output id sequence. Padding ids (0), a leading BOS token and a trailing EOS (or EOT) token
+         are stripped before the sequence is passed to the tokenizer's [`~PreTrainedTokenizer.decode`].
+         """
+         result = args[0]
+         result = result[result != 0]  # drop padding ids
+         if result[0] == self.tokenizer.bos_id:
+             result = result[1:]
+         if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id):
+             result = result[:-1]
+         return self.tokenizer.decode(result, *args[1:], **kwargs).strip()
+
+     def _convert(self, input_str, max_inp_length: Optional[int] = None):
+         # Tokenize, prepending BOS only for older tokenizers (version <= 2.5) that expect it.
+         if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False):
+             input_ids = self.tokenizer.encode(input_str)
+         else:
+             input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str)
+         if max_inp_length is not None:
+             input_ids = input_ids[:max_inp_length]
+         input_ids = torch.tensor(input_ids, dtype=torch.int32)
+
+         # Locate the spans that will hold image (or slice) embeddings: each span starts right
+         # after an image/slice start token and ends at the matching end token.
+         start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
+         end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
+
+         image_start_tokens = torch.where(start_cond)[0]
+         image_start_tokens += 1
+         image_end_tokens = torch.where(end_cond)[0]
+
+         valid_image_nums = max(len(image_start_tokens), len(image_end_tokens))
+
+         image_bounds = torch.hstack(
+             [
+                 image_start_tokens[:valid_image_nums].unsqueeze(-1),
+                 image_end_tokens[:valid_image_nums].unsqueeze(-1),
+             ]
+         )
+         return input_ids, image_bounds
+
+     def _convert_images_texts_to_inputs(
+         self,
+         images,
+         texts: Union[str, List[str]],
+         truncation=None,
+         max_length=None,
+         max_slice_nums=None,
+         use_image_id=None,
+         return_tensors=None,
+         **kwargs,
+     ):
+         if images is None or not len(images):
+             model_inputs = self.tokenizer(
+                 texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
+             )
+             return MiniCPMVBatchFeature(data={**model_inputs})
+
+         # The parentheses are part of the literal placeholder: `re.findall` matches the inner
+         # tag through the group, while `str.split` consumes the full literal string.
+         pattern = "(<image>./</image>)"
+         images, image_sizes, tgt_sizes, temporal_ids, skip_image_idx = (
+             images["pixel_values"],
+             images["image_sizes"],
+             images["tgt_sizes"],
+             images["temporal_ids"],
+             images["skip_image_idx"],
+         )
+
+         if isinstance(texts, str):
+             texts = [texts]
+         input_ids_list = []
+         image_bounds_list = []
+         for index, (text, skip_idx) in enumerate(zip(texts, skip_image_idx)):
+             image_tags = re.findall(pattern, text)
+             assert len(image_tags) == len(image_sizes[index])
+             text_chunks = text.split(pattern)
+             final_text = ""
+
+             # Replace each placeholder with the expanded slice placeholder string, or with
+             # nothing for images flagged to be skipped.
+             for i in range(len(image_tags)):
+                 if i in skip_idx:
+                     image_placeholder = ""
+                     text_chunk = text_chunks[i].strip()
+                 else:
+                     image_placeholder = self.image_processor.get_slice_image_placeholder(
+                         image_sizes[index][i], i, max_slice_nums, use_image_id
+                     )
+                     text_chunk = text_chunks[i]
+
+                 final_text = final_text + text_chunk + image_placeholder
+
+             final_text += text_chunks[-1]
+
+             input_ids, image_bounds = self._convert(final_text, max_length)
+             input_ids_list.append(input_ids)
+             image_bounds_list.append(image_bounds)
+
+         # Left-pad the batch to a common length and shift every image bound by its
+         # sequence's padding length.
+         padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
+         for i, length in enumerate(padding_lengths):
+             image_bounds_list[i] = image_bounds_list[i] + length
+         attention_mask = padded_input_ids.ne(0)
+
+         return MiniCPMVBatchFeature(
+             data={
+                 "input_ids": padded_input_ids,
+                 "attention_mask": attention_mask,
+                 "pixel_values": images,
+                 "image_sizes": image_sizes,
+                 "image_bound": image_bounds_list,
+                 "tgt_sizes": tgt_sizes,
+                 "temporal_ids": temporal_ids,
+             }
+         )
+
+     @property
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+     def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
+         """Pad a list of 1-D or 2-D tensors to a common length and return the per-item padding lengths."""
+         items = []
+         if isinstance(inputs[0], list):
+             assert isinstance(inputs[0][0], torch.Tensor)
+             for it in inputs:
+                 for tr in it:
+                     items.append(tr)
+         else:
+             assert isinstance(inputs[0], torch.Tensor)
+             items = inputs
+
+         batch_size = len(items)
+         shape = items[0].shape
+         dim = len(shape)
+         assert dim <= 2
+         if max_length is None:
+             max_length = 0
+         max_length = max(max_length, max(item.shape[-1] for item in items))
+         min_length = min(item.shape[-1] for item in items)
+         dtype = items[0].dtype
+
+         if dim == 0:
+             # Scalars need no padding.
+             return torch.stack(items, dim=0), [0] * batch_size
+         elif dim == 1:
+             if max_length == min_length:
+                 # All sequences already share the same length.
+                 return torch.stack(items, dim=0), [0] * batch_size
+             tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+         else:
+             tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
+
+         padding_length = []
+         for i, item in enumerate(items):
+             if dim == 1:
+                 if padding_side == "left":
+                     tensor[i, -len(item):] = item.clone()
+                 else:
+                     tensor[i, :len(item)] = item.clone()
+             elif dim == 2:
+                 if padding_side == "left":
+                     tensor[i, -len(item):, :] = item.clone()
+                 else:
+                     tensor[i, :len(item), :] = item.clone()
+             padding_length.append(max_length - len(item))  # pad count along the sequence axis
+
+         return tensor, padding_length
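
For reference, a minimal usage sketch of the uploaded processor. The checkpoint id, image file, and prompt below are illustrative assumptions rather than part of this commit; the literal `(<image>./</image>)` placeholder, the nested `images` batching (one list of images per text), and the returned keys follow the code above and assume the companion image_processing_minicpmv.py from the same repo.

from PIL import Image
from transformers import AutoProcessor

# Illustrative repo id; any checkpoint that ships this processor pair works the same way.
processor = AutoProcessor.from_pretrained("openbmb/MiniCPM-V", trust_remote_code=True)

image = Image.open("example.jpg").convert("RGB")  # assumed local image
# One literal placeholder per image; the surrounding chat template is model-specific.
prompt = "(<image>./</image>)\nDescribe this image."

inputs = processor(text=[prompt], images=[[image]])
# MiniCPMVBatchFeature with keys: input_ids, attention_mask, pixel_values,
# image_sizes, image_bound, tgt_sizes, temporal_ids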
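
The custom pad helper left-pads by default and returns the per-item padding lengths that _convert_images_texts_to_inputs uses to shift the image bounds; a small self-contained check of that behavior:

import torch

ids = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
padded, lengths = processor.pad(ids, padding_side="left")
# padded  -> tensor([[1, 2, 3],
#                    [0, 4, 5]])
# lengths -> [0, 1]  (each entry in image_bound is shifted by this amount)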