aravindhs-NV committed on
Commit
53b2716
·
1 Parent(s): ac4c735

Remove all other files that will be kept local

Browse files
added_tokens.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "</box>": 151673,
3
- "</img>": 151671,
4
- "</interval>": 151679,
5
- "</quad>": 151675,
6
- "</ref>": 151677,
7
- "</think>": 151668,
8
- "</tool_call>": 151658,
9
- "</tool_response>": 151666,
10
- "<IMG_CONTEXT>": 151669,
11
- "<box>": 151672,
12
- "<img>": 151670,
13
- "<interval>": 151678,
14
- "<quad>": 151674,
15
- "<ref>": 151676,
16
- "<think>": 151667,
17
- "<tool_call>": 151657,
18
- "<tool_response>": 151665,
19
- "<|box_end|>": 151649,
20
- "<|box_start|>": 151648,
21
- "<|endoftext|>": 151643,
22
- "<|file_sep|>": 151664,
23
- "<|fim_middle|>": 151660,
24
- "<|fim_pad|>": 151662,
25
- "<|fim_prefix|>": 151659,
26
- "<|fim_suffix|>": 151661,
27
- "<|im_end|>": 151645,
28
- "<|im_start|>": 151644,
29
- "<|image_pad|>": 151655,
30
- "<|object_ref_end|>": 151647,
31
- "<|object_ref_start|>": 151646,
32
- "<|quad_end|>": 151651,
33
- "<|quad_start|>": 151650,
34
- "<|repo_name|>": 151663,
35
- "<|video_pad|>": 151656,
36
- "<|vision_end|>": 151653,
37
- "<|vision_pad|>": 151654,
38
- "<|vision_start|>": 151652
39
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
chat_template.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}<image {{ image_count.value }}>{% endif %}<image-{{ image_count.value }}>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}<video {{ video_count.value }}>{% endif %}<video-{{ video_count.value }}>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
- }
 
 
 
 
image_processing_eagle2_5_vl_fast.py DELETED
@@ -1,502 +0,0 @@
1
- # --------------------------------------------------------
2
- # NVIDIA
3
- # Copyright (c) 2025 NVIDIA
4
- # Licensed under The MIT License [see LICENSE for details]
5
- # --------------------------------------------------------
6
-
7
- from functools import partial
8
-
9
- # Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
10
- from typing import Optional
11
-
12
- from transformers.image_processing_utils import (
13
- BatchFeature,
14
- get_patch_output_size,
15
- )
16
- from transformers.image_processing_utils_fast import (
17
- BaseImageProcessorFast,
18
- DefaultFastImageProcessorKwargs,
19
- group_images_by_shape,
20
- reorder_images,
21
- )
22
- from transformers.image_utils import (
23
- IMAGENET_STANDARD_MEAN, # 0.5, 0.5, 0.5
24
- IMAGENET_STANDARD_STD, # 0.5, 0.5, 0.5
25
- ChannelDimension,
26
- ImageInput,
27
- PILImageResampling,
28
- SizeDict,
29
- get_image_size,
30
- make_flat_list_of_images,
31
- validate_kwargs,
32
- )
33
- from transformers.processing_utils import Unpack
34
- from transformers.utils import (
35
- TensorType,
36
- add_start_docstrings,
37
- is_torch_available,
38
- is_torchvision_v2_available,
39
- )
40
- from transformers.video_utils import VideoInput
41
-
42
- if is_torch_available():
43
- import torch
44
- if is_torchvision_v2_available():
45
- from torchvision.transforms.v2 import functional as F # noqa: N812
46
- from transformers.image_utils import pil_torch_interpolation_mapping
47
- else:
48
- from torchvision.transforms import functional as F # noqa: N812
49
-
50
-
51
def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> torch.Tensor:
    """Crop a rectangular region out of an image tensor.

    Args:
        img (torch.Tensor): Image to be cropped, laid out as (C, H, W) or (H, W).
        left (int): The left coordinate of the crop box (inclusive).
        top (int): The top coordinate of the crop box (inclusive).
        right (int): The right coordinate of the crop box (exclusive).
        bottom (int): The bottom coordinate of the crop box (exclusive).

    Returns:
        torch.Tensor: The cropped region, with the same number of dimensions as `img`.

    Raises:
        TypeError: If `img` is not a `torch.Tensor`.
        ValueError: If `img` is not 2-D or 3-D, if the box extends outside the image,
            or if the box is empty (`top >= bottom` or `left >= right`).
    """
    if not isinstance(img, torch.Tensor):
        raise TypeError(f"img should be torch.Tensor. Got {type(img)}")

    if img.ndim not in [2, 3]:
        raise ValueError(f"Image should have 2 or 3 dimensions. Got {img.ndim}")

    # The spatial axes are the last two for both accepted layouts. Indexing from the
    # end keeps the 2-D case (which the check above allows) from raising IndexError.
    img_height, img_width = img.shape[-2], img.shape[-1]
    if top < 0 or left < 0 or bottom > img_height or right > img_width:
        raise ValueError("Crop coordinates out of bounds")

    if top >= bottom or left >= right:
        raise ValueError("Invalid crop coordinates")

    return img[..., top:bottom, left:right]
79
-
80
-
81
class Eagle25VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """Keyword arguments specific to the Eagle2.5-VL fast image processor.

    These are declared in addition to the keys inherited from
    `DefaultFastImageProcessorKwargs`; every one of them may be left as `None`.
    """

    # Largest number of tiles an image may be split into.
    max_dynamic_tiles: Optional[int]
    # Smallest number of tiles an image may be split into.
    min_dynamic_tiles: Optional[int]
    # Whether a thumbnail tile is appended after the regular tiles.
    use_thumbnail: Optional[bool]
    # Whether the image is aspect-preservingly resized and padded before tiling.
    pad_during_tiling: Optional[bool]
    # Whether per-image tile stacks are padded to a common tile count.
    do_pad: Optional[bool]
87
-
88
-
89
@add_start_docstrings(
    "Constructs a fast Eagle2.5-VL image processor. Each image (or video frame) is resized to a grid of square "
    "tiles selected from its aspect ratio and area, split into those tiles, and optionally followed by a thumbnail "
    "of the whole image.",
    # BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, TODO: this was deprecated from transformers, remove!
    """
        max_dynamic_tiles (`int`, *optional*):
            The maximum number of dynamic tiles to use for processing high resolution images.
        min_dynamic_tiles (`int`, *optional*):
            The minimum number of dynamic tiles to use for processing high resolution images.
        use_thumbnail (`bool`, *optional*):
            Whether to append a thumbnail of the whole image after the tiles.
        pad_during_tiling (`bool`, *optional*):
            Whether to keep the aspect ratio and pad the image while tiling instead of stretching it.
        do_pad (`bool`, *optional*):
            Whether to pad the tile dimension of every image in the batch with zeros up to the largest number of
            tiles in the batch.
    """,
)
class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
    resample = PILImageResampling.BICUBIC
    image_mean = IMAGENET_STANDARD_MEAN
    image_std = IMAGENET_STANDARD_STD
    size = {"height": 448, "width": 448}
    default_to_square = False
    crop_size = None
    do_resize = True
    do_center_crop = None
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    do_pad = True
    max_dynamic_tiles = 12
    min_dynamic_tiles = 1
    use_thumbnail = True
    pad_during_tiling = False
    valid_kwargs = Eagle25VLFastImageProcessorKwargs
    # NOTE(review): `_preprocess` returns its tensors under "pixel_values", not
    # "pixel_values_videos" - confirm which key the model expects.
    model_input_names = ["pixel_values_videos"]

    def __init__(self, **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs]):
        super().__init__(**kwargs)

    def _prepare_images_structure(
        self,
        images: ImageInput,
    ) -> ImageInput:
        """
        Prepare the images structure for processing.

        Args:
            images (`ImageInput`):
                The input images to process.

        Returns:
            `ImageInput`: The images flattened into a single list.
        """
        return make_flat_list_of_images(images)

    def _prepare_videos_structure(self, videos: VideoInput) -> VideoInput:
        """Flatten `videos` exactly like images; frames are handled as independent images."""
        return self._prepare_images_structure(videos)

    def _prepare_input_videos(
        self,
        videos: VideoInput,
        do_convert_rgb: bool | None = None,
        input_data_format: str | ChannelDimension | None = None,
        device: Optional["torch.device"] = None,
    ) -> list["torch.Tensor"]:
        """
        Prepare the input videos for processing.

        Every entry of the flattened input is passed through `self._process_image` with the given
        `do_convert_rgb`, `input_data_format` and `device`.
        """
        videos = self._prepare_videos_structure(videos)
        process_video_fn = partial(
            self._process_image,
            do_convert_rgb=do_convert_rgb,
            input_data_format=input_data_format,
            device=device,
        )
        # todo: yoni - check if we can parallelize this efficiently
        return [process_video_fn(video) for video in videos]

    def _resize_for_patching(
        self,
        image: "torch.Tensor",
        target_resolution: tuple,
        interpolation: "F.InterpolationMode",
        input_data_format: ChannelDimension,
    ) -> "torch.Tensor":
        """
        Resizes an image towards a target resolution while maintaining aspect ratio.

        Args:
            image ("torch.Tensor"):
                The input image.
            target_resolution (tuple):
                The target resolution (height, width) of the image.
            interpolation (`InterpolationMode`):
                Resampling filter to use if resizing the image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            "torch.Tensor": The resized image (padding is done separately by `_pad_for_patching`).
        """
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
        return F.resize(image, (new_height, new_width), interpolation=interpolation)

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        """
        Pick the tiling grid that best matches the image.

        Earlier versions looked at the aspect ratio only; this one also takes the covered area into account.

        Args:
            aspect_ratio: Width / height of the original image.
            target_ratios: Candidate grids as `(columns, rows)` pairs, tried in the given order.
            width: Original image width.
            height: Original image height.
            image_size: Edge length of one square tile.

        Returns:
            The candidate with the highest score; the earliest candidate wins ties. `(1, 1)` when
            `target_ratios` is empty.
        """
        best_factor = float("-inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            # A tiled area above 60% of the original image area is considered enough, so the
            # area term is capped at 0.6.
            area_term = min((ratio[0] * ratio[1] * image_size * image_size) / area, 0.6)
            # Equals 1.0 for a perfect aspect-ratio match and shrinks as the shapes diverge.
            ratio_term = min(target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio)
            factor_based_on_area_n_ratio = area_term * ratio_term

            if factor_based_on_area_n_ratio > best_factor:
                best_factor = factor_based_on_area_n_ratio
                best_ratio = ratio

        return best_ratio

    def _pad_for_patching(
        self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
    ) -> "torch.Tensor":
        """
        Pad an already resized image to a target resolution, keeping it centred.

        Args:
            image ("torch.Tensor"):
                The resized image.
            target_resolution (tuple):
                The target resolution (height, width).
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            "torch.Tensor": The padded image.
        """
        target_height, target_width = target_resolution
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

        # When the size difference is odd, the leftover pixel goes to the right/bottom so
        # that the result is exactly `target_resolution` and can be split into whole tiles.
        paste_x, extra_x = divmod(target_width - new_width, 2)
        paste_y, extra_y = divmod(target_height - new_height, 2)

        # torchvision padding order: [left, top, right, bottom]
        return F.pad(image, padding=[paste_x, paste_y, paste_x + extra_x, paste_y + extra_y])

    def _get_image_patches(
        self,
        image: "torch.Tensor",
        min_num: int,
        max_num: int,
        size: tuple,
        tile_size: int,
        use_thumbnail: bool,
        interpolation: "F.InterpolationMode",
        pad_during_tiling: bool,
    ) -> list["torch.Tensor"]:
        """
        Split one image into square tiles.

        Args:
            image ("torch.Tensor"):
                Channels-first image to split.
            min_num (`int`):
                Minimum number of tiles in the grid.
            max_num (`int`):
                Maximum number of tiles in the grid.
            size (tuple):
                Currently unused; kept so that the call signature stays unchanged.
            tile_size (`int`):
                Edge length of each square tile.
            use_thumbnail (`bool`):
                Whether to append the whole image resized to one tile when more than one tile is produced.
            interpolation (`InterpolationMode`):
                Resampling filter used for every resize.
            pad_during_tiling (`bool`):
                If `True`, resize with preserved aspect ratio and pad to the grid; otherwise resize
                directly to the grid resolution.

        Returns:
            `list[torch.Tensor]`: The tiles in row-major order, optionally followed by the thumbnail.
        """
        orig_height, orig_width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
        aspect_ratio = orig_width / orig_height

        # Enumerate every (columns, rows) grid whose tile count lies in [min_num, max_num].
        target_ratios = {
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if i * j <= max_num and i * j >= min_num
        }
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # Find the grid closest to the image.
        columns, rows = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, tile_size
        )

        # Resolution of the full grid.
        target_width = tile_size * columns
        target_height = tile_size * rows
        blocks = columns * rows

        if pad_during_tiling:
            resized_image = self._resize_for_patching(
                image,
                (target_height, target_width),
                interpolation=interpolation,
                input_data_format=ChannelDimension.FIRST,
            )
            image_used_to_split = self._pad_for_patching(
                resized_image,
                (target_height, target_width),
                input_data_format=ChannelDimension.FIRST,
            )
        else:
            image_used_to_split = F.resize(image, (target_height, target_width), interpolation=interpolation)

        tiles_per_row = target_width // tile_size
        processed_tiles = []
        for index in range(blocks):
            row, column = divmod(index, tiles_per_row)
            left = column * tile_size
            top = row * tile_size
            processed_tiles.append(crop(image_used_to_split, left, top, left + tile_size, top + tile_size))

        if use_thumbnail and len(processed_tiles) != 1:
            thumbnail_img = F.resize(image, (tile_size, tile_size), interpolation=interpolation)
            processed_tiles.append(thumbnail_img)

        return processed_tiles

    def _pad_for_batching(
        self,
        pixel_values: list["torch.Tensor"],
    ) -> list["torch.Tensor"]:
        """
        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

        Args:
            pixel_values (`List[torch.Tensor]`):
                One tensor per image; its first dimension is the number of patches and it is followed by three
                more dimensions.

        Returns:
            List[`torch.Tensor`]: The padded images.
        """
        max_patch = max(len(x) for x in pixel_values)
        # The pad list runs from the last dimension backwards, two entries per dimension, so
        # only the end of the first (patch) dimension is extended.
        return [
            torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
            for image in pixel_values
        ]

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        max_dynamic_tiles: int,
        min_dynamic_tiles: int,
        use_thumbnail: bool,
        pad_during_tiling: bool,
        interpolation: Optional["F.InterpolationMode"],
        do_center_crop: bool,
        crop_size: SizeDict,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        do_pad: bool,
        return_tensors: str | TensorType | None,
    ) -> BatchFeature:
        """
        Tile every image, then resize / centre-crop / rescale / normalize the tiles.

        Returns:
            `BatchFeature` with `pixel_values` (tiles of all images, concatenated along the first dimension
            when `return_tensors` is set) and `image_sizes` (the `get_image_size` result for each input image).
        """
        processed_images = []
        image_sizes = []

        # Determine the size tuple
        if size and size.height and size.width:
            size_tuple = (size.height, size.width)
        else:
            size_tuple = (size.shortest_edge, size.shortest_edge)

        # Determine the tile size: crop_size takes precedence over size
        if crop_size and crop_size.height:
            tile_size = crop_size.height
        elif size and size.height:
            tile_size = size.height
        else:
            tile_size = size.shortest_edge

        for image in images:
            image_patches = self._get_image_patches(
                image,
                min_num=min_dynamic_tiles,
                max_num=max_dynamic_tiles,
                size=size_tuple,
                tile_size=tile_size,
                use_thumbnail=use_thumbnail,
                interpolation=interpolation,
                pad_during_tiling=pad_during_tiling,
            )

            # Group tiles by shape for batched processing
            processed_image_patches_grouped = {}
            grouped_image_patches, grouped_image_patches_index = group_images_by_shape(image_patches)

            for shape, stacked_image_patches in grouped_image_patches.items():
                if do_resize:
                    stacked_image_patches = self.resize(
                        image=stacked_image_patches,
                        size=size,
                        interpolation=interpolation,
                    )
                if do_center_crop:
                    stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)
                # Fused rescale and normalize
                stacked_image_patches = self.rescale_and_normalize(
                    stacked_image_patches,
                    do_rescale,
                    rescale_factor,
                    do_normalize,
                    image_mean,
                    image_std,
                )
                processed_image_patches_grouped[shape] = stacked_image_patches

            processed_image_patches = reorder_images(
                processed_image_patches_grouped, grouped_image_patches_index
            )
            if return_tensors:
                processed_image_patches = torch.stack(processed_image_patches, dim=0)
            processed_images.append(processed_image_patches)
            image_sizes.append(get_image_size(image, ChannelDimension.FIRST))

        if do_pad:
            processed_images = self._pad_for_batching(processed_images)

        # Tiles of all images are concatenated (not stacked) along the first dimension.
        if return_tensors:
            processed_images = torch.cat(processed_images, dim=0)
        return BatchFeature(
            data={"pixel_values": processed_images, "image_sizes": image_sizes},
            tensor_type=return_tensors,
        )

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Preprocess images, or videos when `images` is `None`.

        Args:
            images (`ImageInput`):
                Images to preprocess. When not `None`, `videos` is prepared but not returned.
            videos (`VideoInput`, *optional*):
                Video frames to preprocess; only used for the result when `images` is `None`.
            max_dynamic_tiles (`int`, *optional*):
                The maximum number of dynamic tiles to use for processing high resolution images.
            min_dynamic_tiles (`int`, *optional*):
                The minimum number of dynamic tiles to use for processing high resolution images.
            use_thumbnail (`bool`, *optional*):
                Whether to use a thumbnail for processing high resolution images.
            pad_during_tiling (`bool`, *optional*):
                Whether to pad the image during tiling.
            do_pad (`bool`, *optional*):
                Whether to pad the tile dimension of the images in the batch to the largest number of tiles in
                the batch.

        Returns:
            `BatchFeature`: The output of `_preprocess`, or `None` when both `images` and `videos` are `None`.
        """
        # `pil_torch_interpolation_mapping` is imported at module level only when torchvision v2
        # is available; importing it here keeps this method working on the v1 code path too.
        from transformers.image_utils import pil_torch_interpolation_mapping

        validate_kwargs(
            captured_kwargs=kwargs.keys(),
            valid_processor_keys=self.valid_kwargs.__annotations__.keys(),
        )
        # Set default kwargs from self. This ensures that if a kwarg is not provided
        # by the user, it gets its default value from the instance, or is set to None.
        for kwarg_name in self.valid_kwargs.__annotations__:
            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

        # Extract parameters that are only used for preparing the input images
        do_convert_rgb = kwargs.pop("do_convert_rgb")
        input_data_format = kwargs.pop("input_data_format")
        device = kwargs.pop("device")

        # Prepare input images
        if images is not None:
            images = self._prepare_input_images(
                images=images,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                device=device,
            )

        if videos is not None:
            videos = self._prepare_input_images(
                images=videos,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                device=device,
            )

        # Update kwargs that need further processing before being validated
        kwargs = self._further_process_kwargs(**kwargs)

        # Validate kwargs
        self._validate_preprocess_kwargs(**kwargs)

        # torch resize uses interpolation instead of resample
        resample = kwargs.pop("resample")
        kwargs["interpolation"] = (
            pil_torch_interpolation_mapping[resample]
            if isinstance(resample, PILImageResampling | int)
            else resample
        )

        # Pop kwargs that are not needed in _preprocess
        kwargs.pop("default_to_square")
        kwargs.pop("data_format")

        # Images take precedence over videos.
        inputs = images if images is not None else videos
        if inputs is None:
            # NOTE(review): nothing to process; `None` is returned as before - confirm callers expect this.
            return None
        return self._preprocess(inputs, **kwargs)
502
- __all__ = ["Eagle25VLImageProcessorFast"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocessor_config.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "auto_map": {
3
- "AutoImageProcessor": "image_processing_eagle2_5_vl_fast.Eagle25VLImageProcessorFast",
4
- "AutoProcessor": "processing_eagle2_5_vl.Eagle25VLProcessor"
5
- },
6
- "crop_size": null,
7
- "data_format": "channels_first",
8
- "default_to_square": false,
9
- "device": null,
10
- "do_center_crop": null,
11
- "do_convert_rgb": true,
12
- "do_normalize": true,
13
- "do_pad": false,
14
- "do_rescale": true,
15
- "do_resize": false,
16
- "image_mean": [
17
- 0.5,
18
- 0.5,
19
- 0.5
20
- ],
21
- "image_processor_type": "Eagle25VLImageProcessorFast",
22
- "image_std": [
23
- 0.5,
24
- 0.5,
25
- 0.5
26
- ],
27
- "input_data_format": null,
28
- "max_dynamic_tiles": 12,
29
- "min_dynamic_tiles": 1,
30
- "pad_during_tiling": false,
31
- "processor_class": "Eagle25VLProcessor",
32
- "resample": 3,
33
- "rescale_factor": 0.00392156862745098,
34
- "return_tensors": null,
35
- "size": {
36
- "height": 224,
37
- "width": 224
38
- },
39
- "tokens_per_tile": 256,
40
- "use_thumbnail": true
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
processing_eagle2_5_vl.py DELETED
@@ -1,822 +0,0 @@
1
- # Copyright 2024 The HuggingFace Inc. team.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- """
15
- Processor class for Eagle25VL.
16
- copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/processing_llava_onevision.py
17
- """
18
-
19
- import base64
20
- import math
21
- import os
22
- import re
23
- import time
24
- import warnings
25
- from functools import lru_cache
26
- from io import BytesIO
27
- from typing import Any, Literal
28
-
29
- import requests
30
- import torch
31
- import torchvision
32
- from packaging import version
33
- from PIL import Image
34
- from torchvision import io
35
- from transformers.feature_extraction_utils import BatchFeature
36
- from transformers.image_utils import ImageInput
37
- from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
38
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
39
- from transformers.utils import logging
40
- from transformers.video_utils import VideoInput
41
-
42
- logger = logging.get_logger(__name__)
43
-
44
-
45
- FRAME_FACTOR = 2
46
- FPS = 2.0
47
- FPS_MIN_FRAMES = 4
48
- FPS_MAX_FRAMES = 256
49
-
50
-
51
def adjust_by_factor(number: int, factor: int, method: Literal["round", "ceil", "floor"] = "round") -> int:
    """Snap ``number`` onto a multiple of ``factor``.

    Args:
        number: Value to adjust.
        factor: Step whose multiples are the allowed results.
        method: ``"round"`` picks the nearest multiple (using Python's
            built-in ``round``), ``"ceil"`` the next one up, ``"floor"``
            the next one down.

    Returns:
        The selected multiple of ``factor``.

    Raises:
        KeyError: If ``method`` is not one of the three supported names.
    """
    if method == "round":
        steps = round(number / factor)
    elif method == "ceil":
        steps = math.ceil(number / factor)
    elif method == "floor":
        steps = math.floor(number / factor)
    else:
        # Same failure mode as a lookup in a name -> function table.
        raise KeyError(method)
    return steps * factor
55
-
56
-
57
def to_rgb(pil_image: Image.Image) -> Image.Image:
    """Return an RGB version of ``pil_image``.

    RGBA images are composited onto a white canvas, using their alpha
    channel as the paste mask; every other mode goes through
    ``Image.convert("RGB")``.
    """
    if pil_image.mode != "RGBA":
        return pil_image.convert("RGB")

    canvas = Image.new("RGB", pil_image.size, (255, 255, 255))
    alpha_channel = pil_image.split()[3]  # Use alpha channel as mask
    canvas.paste(pil_image, mask=alpha_channel)
    return canvas
64
-
65
-
66
def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image:
    """Load the image described by ``ele`` and return it in RGB mode.

    Args:
        ele: Content element. The image is read from ``ele["image"]`` or,
            if that key is absent, ``ele["image_url"]``. It may be a
            ``PIL.Image.Image``, an ``http(s)://`` URL, a ``file://`` URL,
            a ``data:image...;base64,`` URI, or a local path. An optional
            ``"scale_factor"`` entry rescales both dimensions.

    Returns:
        The RGB image, resized with bilinear resampling when
        ``scale_factor`` is present.

    Raises:
        ValueError: If the input is not one of the supported forms (this
            includes ``data:image`` URIs without a ``base64,`` payload).
        requests.HTTPError: If a remote image request returns an error
            status.
    """
    image = ele["image"] if "image" in ele else ele["image_url"]
    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif isinstance(image, str):
        if image.startswith(("http://", "https://")):
            response = requests.get(image, timeout=10)
            # Fail on 4xx/5xx instead of handing an error page to PIL.
            response.raise_for_status()
            image_obj = Image.open(BytesIO(response.content))
        elif image.startswith("file://"):
            image_obj = Image.open(image[len("file://"):])
        elif image.startswith("data:image"):
            if "base64," in image:
                _, base64_data = image.split("base64,", 1)
                image_obj = Image.open(BytesIO(base64.b64decode(base64_data)))
        else:
            image_obj = Image.open(image)
    if image_obj is None:
        raise ValueError(
            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
        )
    image = to_rgb(image_obj)
    if "scale_factor" in ele:
        scale_factor = ele["scale_factor"]
        # Round to integer pixel sizes so fractional scale factors are
        # usable, and never request a zero-sized image.
        new_size = (
            max(1, round(image.width * scale_factor)),
            max(1, round(image.height * scale_factor)),
        )
        image = image.resize(new_size, Image.BILINEAR)
    return image
92
-
93
-
94
def smart_nframes(
    ele: dict,
    total_frames: int,
    video_fps: int | float,
) -> int:
    """Decide how many frames of a video are fed to the model.

    Args:
        ele (dict): video configuration. Exactly one sampling mode may be
            given:
            - nframes: explicit number of frames to extract.
            - fps: sampling rate; defaults to ``FPS`` when neither key is
              present.
            - min_frames / max_frames: bounds applied in fps mode only.
        total_frames (int): number of frames in the source video.
        video_fps (int | float): frame rate of the source video.

    Raises:
        ValueError: if the result falls outside
            ``[FRAME_FACTOR, total_frames]``.

    Returns:
        int: frame count, a multiple of ``FRAME_FACTOR``.
    """
    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
    if "nframes" not in ele:
        target_fps = ele.get("fps", FPS)
        lower_bound = adjust_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR, method="ceil")
        upper_bound = adjust_by_factor(
            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR, method="floor"
        )
        # Duration in seconds times the requested sampling rate.
        nframes = total_frames / video_fps * target_fps
        if nframes > total_frames:
            logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
        clamped = min(max(nframes, lower_bound), upper_bound, total_frames)
        nframes = adjust_by_factor(clamped, FRAME_FACTOR, method="floor")
    else:
        nframes = adjust_by_factor(ele["nframes"], FRAME_FACTOR, method="round")

    if nframes < FRAME_FACTOR or nframes > total_frames:
        raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
    return nframes
134
-
135
-
136
def _read_video_torchvision(
    ele: dict,
) -> tuple[torch.Tensor, float, list]:
    """Read a video with ``torchvision.io.read_video`` and sample frames.

    Args:
        ele: Video element. ``ele["video"]`` is the path or URL; optional
            ``"video_start"`` / ``"video_end"`` (seconds) bound the decoded
            range. The sampling keys are interpreted by ``smart_nframes``.

    Returns:
        ``(video, sample_fps, timestamps)``: the sampled frames (decoded
        with ``output_format="TCHW"``), the effective sampling rate, and
        one timestamp in seconds per sampled frame, offset by
        ``video_start``.
    """
    video_path = ele["video"]
    # NOTE(review): indentation was lost in this view; nesting both checks
    # under the version test mirrors upstream qwen-vl-utils -- confirm.
    if version.parse(torchvision.__version__) < version.parse("0.19.0"):
        if "http://" in video_path or "https://" in video_path:
            warnings.warn(
                "torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.",
                stacklevel=2,
            )
        # Only strip the scheme when it really is a prefix; a substring
        # match would cut seven characters off an unrelated path.
        if video_path.startswith("file://"):
            video_path = video_path[len("file://"):]
    start_time = ele.get("video_start", 0.0)
    st = time.time()
    video, _audio, info = io.read_video(
        video_path,
        start_pts=start_time,
        end_pts=ele.get("video_end"),
        pts_unit="sec",
        output_format="TCHW",
    )
    total_frames, video_fps = video.size(0), info["video_fps"]
    logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    # Evenly spaced frame indices and their timestamps relative to the
    # requested start time.
    idx = torch.linspace(0, total_frames - 1, nframes).round().long()
    timestamps = (start_time + idx.to(torch.float32) / video_fps).tolist()
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    video = video[idx]
    return video, sample_fps, timestamps
167
-
168
-
169
def is_decord_available() -> bool:
    """Report whether the ``decord`` package can be located, without importing it."""
    from importlib.util import find_spec

    spec = find_spec("decord")
    return spec is not None
173
-
174
-
175
def _read_video_decord(
    ele: dict,
) -> tuple[torch.Tensor, float, list]:
    """Read a video with ``decord.VideoReader`` and sample frames.

    Args:
        ele: Video element. ``ele["video"]`` is the path handed to
            ``decord.VideoReader``. ``"video_start"`` / ``"video_end"`` are
            not supported by this backend.

    Returns:
        ``(video, sample_fps, timestamps)``: the sampled frames permuted to
        TCHW, the effective sampling rate, and one timestamp in seconds per
        sampled frame (measured from the start of the video).

    Raises:
        NotImplementedError: If ``video_start`` or ``video_end`` is given.
    """
    import decord

    # Reject unsupported options before paying for opening the file.
    if "video_start" in ele or "video_end" in ele:
        raise NotImplementedError("not support start_pts and end_pts in decord for now.")

    video_path = ele["video"]
    st = time.time()
    vr = decord.VideoReader(video_path)
    total_frames, video_fps = len(vr), vr.get_avg_fps()
    logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
    # A start offset cannot be set here (see the check above), so
    # timestamps always count from 0.
    timestamps = [i / video_fps for i in idx]
    video = vr.get_batch(idx).asnumpy()
    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    return video, sample_fps, timestamps
196
-
197
-
198
- VIDEO_READER_BACKENDS = {
199
- "decord": _read_video_decord,
200
- "torchvision": _read_video_torchvision,
201
- }
202
-
203
-
204
@lru_cache(maxsize=1)
def get_video_reader_backend() -> str:
    """Name the video backend to use: ``"decord"`` when installed, else ``"torchvision"``.

    The answer is cached after the first call.
    """
    if is_decord_available():
        return "decord"
    return "torchvision"
208
-
209
-
210
def fetch_video(ele: dict, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
    """Load the video described by ``ele``.

    Args:
        ele: Video element. ``ele["video"]`` is either a string (path/URL,
            decoded by the selected reader backend) or a list/tuple of
            frame images, each loaded through ``fetch_image`` with the
            remaining keys of ``ele`` forwarded.
        return_video_sample_fps: When true, also return the sampling rate
            and the per-frame timestamps.

    Returns:
        The frames (a tensor for string inputs, a list of PIL images for
        frame lists). With ``return_video_sample_fps=True`` a tuple
        ``(frames, sample_fps, timestamps)`` is returned instead; for frame
        lists the timestamps are all ``-1`` because they are unknown.
    """
    if isinstance(ele["video"], str):
        video_reader_backend = get_video_reader_backend()
        try:
            video, sample_fps, timestamps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
        except Exception as e:
            # Retrying only makes sense when a different backend is left.
            if video_reader_backend == "torchvision":
                raise
            logger.warning(
                f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}"
            )
            video, sample_fps, timestamps = VIDEO_READER_BACKENDS["torchvision"](ele)

        if return_video_sample_fps:
            return video, sample_fps, timestamps
        return video

    assert isinstance(ele["video"], list | tuple)
    process_info = ele.copy()
    process_info.pop("type", None)
    process_info.pop("video", None)
    images = [fetch_image({"image": video_element, **process_info}) for video_element in ele["video"]]
    # Pad with copies of the last frame up to a multiple of FRAME_FACTOR.
    nframes = adjust_by_factor(len(images), FRAME_FACTOR, method="ceil")
    if len(images) < nframes:
        images.extend([images[-1]] * (nframes - len(images)))

    # -1 marks "timestamp unknown" for frames supplied as images.
    timestamps = [-1] * nframes
    if return_video_sample_fps:
        return images, process_info.pop("fps", FPS), timestamps
    return images
240
-
241
-
242
class Eagle25VLProcessorKwargs(ProcessingKwargs, total=False):
    # Default per-modality keyword arguments; see the
    # processing_utils.ProcessingKwargs documentation for usage.
    _defaults = dict(
        text_kwargs=dict(padding=False),
        images_kwargs=dict(),
        videos_kwargs=dict(max_dynamic_tiles=1),
    )
251
-
252
-
253
- class Eagle25VLProcessor(ProcessorMixin):
254
- r"""
255
- Constructs a Eagle25VL processor which wraps a Eagle25VL video processor, Eagle25VL image processor and a Eagle25VL tokenizer into a single processor.
256
-
257
- [`Eagle25VLProcessor`] offers all the functionalities of [`Eagle25VLVideoProcessor`], [`Eagle25VLImageProcessor`] and [`Eagle25VLTokenizer`]. See the
258
- [`~Eagle25VLVideoProcessor.__call__`], [`~Eagle25VLProcessor.__call__`] and [`~Eagle25VLProcessor.decode`] for more information.
259
-
260
- Args:
261
- image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
262
- The image processor is a required input.
263
- tokenizer ([`LlamaTokenizerFast`], *optional*):
264
- The tokenizer is a required input.
265
- num_image_tokens (`int`, *optional*):
266
- Number of image tokens for one image that will be returned by vision tower.
267
- vision_feature_select_strategy (`str`, *optional*):
268
- The feature selection strategy used to select the vision feature from the vision backbone.
269
- Should be same as in model's config
270
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
271
- in a chat into a tokenizable string.
272
- image_token (`str`, *optional*, defaults to `"<image>"`):
273
- Special token used to denote image location.
274
- video_token (`str`, *optional*, defaults to `"<video>"`):
275
- Special token used to denote video location.
276
- """
277
-
278
- attributes = ["image_processor", "tokenizer"]
279
- valid_kwargs = [
280
- "chat_template",
281
- "num_image_tokens",
282
- "vision_feature_select_strategy",
283
- "image_token",
284
- "video_token",
285
- "images_kwargs",
286
- "videos_kwargs",
287
- "text_kwargs",
288
- ]
289
- image_processor_class = "AutoImageProcessor"
290
- tokenizer_class = "AutoTokenizer"
291
-
292
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
    vision_feature_select_strategy=None,
    chat_template=None,
    image_token="<IMG_CONTEXT>",  # nosec: B107
    video_token="<IMG_CONTEXT>",  # nosec: B107
    tokens_per_tile=256,
    image_placeholder="image",
    video_placeholder="video",
    image_start_token="<img>",
    image_end_token="</img>",
    **kwargs,
):
    """Store the prompt-expansion settings and register the sub-processors.

    Tokens and token ids declared on ``tokenizer`` take precedence over the
    ``image_token`` / ``video_token`` arguments; a missing (or falsy) id is
    resolved with ``tokenizer.convert_tokens_to_ids``. Only ``auto_map`` is
    read from ``kwargs``.
    """
    # Plain settings used when expanding media placeholders.
    self.tokens_per_tile = tokens_per_tile
    self.image_placeholder = image_placeholder
    self.video_placeholder = video_placeholder
    self.image_start_token = image_start_token
    self.image_end_token = image_end_token
    self.vision_feature_select_strategy = vision_feature_select_strategy

    # Token strings: a tokenizer-declared token wins over the argument.
    self.image_token = getattr(tokenizer, "image_token", image_token)
    self.video_token = getattr(tokenizer, "video_token", video_token)

    # Token ids: use the tokenizer's id when set, otherwise look it up.
    declared_image_id = getattr(tokenizer, "image_token_id", None)
    if declared_image_id:
        self.image_token_id = declared_image_id
    else:
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)

    declared_video_id = getattr(tokenizer, "video_token_id", None)
    if declared_video_id:
        self.video_token_id = declared_video_id
    else:
        self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token)

    if "auto_map" in kwargs:
        self.auto_map = kwargs["auto_map"]
    super().__init__(image_processor, tokenizer, chat_template=chat_template)
328
-
329
def replace_media_placeholder(
    self, text, image_list, video_list, timestamps_list, fps_list, **output_kwargs
):
    """Expand ``<image-N>`` / ``<video-N>`` placeholders in one text sample.

    Each placeholder is replaced by the textual prompt for that media item
    (start token, repeated context tokens, end token) and the media item is
    run through ``self.image_processor``.

    Args:
        text: Sample text containing 1-based placeholders.
        image_list: Images addressable by ``<image-N>`` (``N - 1`` indexes
            this list).
        video_list: Videos addressable by ``<video-N>``.
        timestamps_list: Optional per-video lists of frame timestamps. A
            video whose list contains ``-1`` is treated as having no
            timestamps.
        fps_list: Optional per-video sampling rates.
        **output_kwargs: Must contain ``"images_kwargs"`` and
            ``"videos_kwargs"`` dicts, forwarded to the image processor.

    Returns:
        ``(text, pixel_values, image_sizes, num_images, num_videos)``.
        ``pixel_values`` and ``image_sizes`` are concatenated over all
        replaced placeholders in order of appearance, or ``None`` when the
        text held no placeholder.

    Raises:
        ValueError: If a matched media type is neither placeholder.
    """
    # Matches formats like <image-1> or <video-2>.
    pattern = re.compile(rf"<({self.image_placeholder}|{self.video_placeholder})-(\d+)>")
    ordinal_words = (
        "first",
        "second",
        "third",
        "fourth",
        "fifth",
        "sixth",
        "seventh",
        "eighth",
        "ninth",
        "tenth",
    )

    videos_kwargs = output_kwargs["videos_kwargs"]
    video_min_dynamic_tiles = videos_kwargs.get("min_dynamic_tiles", self.image_processor.min_dynamic_tiles)
    video_max_dynamic_tiles = videos_kwargs.get("max_dynamic_tiles", self.image_processor.max_dynamic_tiles)
    video_use_thumbnail = videos_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
    tile_size = self.image_processor.size.get("height", 448)

    unified_frame_list = []
    num_of_images_in_this_sample = 0
    num_of_videos_in_this_sample = 0

    def ordinal(position):
        # Words for 1..10; numeric ordinals ("11th", "21st") beyond that,
        # so samples with more than ten videos still render.
        if 1 <= position <= len(ordinal_words):
            return ordinal_words[position - 1]
        if 10 <= position % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
        return f"{position}{suffix}"

    def media_tokens(num_tiles):
        # One run of context tokens per tile, wrapped in start/end markers.
        return f"{self.image_start_token}{self.image_token * num_tiles * self.tokens_per_tile}{self.image_end_token}"

    def repl(match):
        nonlocal num_of_images_in_this_sample
        nonlocal num_of_videos_in_this_sample
        media_type = match.group(1)
        idx_in_list = int(match.group(2)) - 1  # Convert to list index (0-based)

        # Compare against the configured placeholder names: they are what
        # the pattern above was built from.
        if media_type == self.image_placeholder:
            image_inputs = self.image_processor(
                images=[image_list[idx_in_list]],
                videos=None,
                **output_kwargs["images_kwargs"],
            )
            num_all_tiles = image_inputs["pixel_values"].shape[0]
            unified_frame_list.append(image_inputs)
            num_of_images_in_this_sample += 1
            return f"<image {idx_in_list + 1}>{media_tokens(num_all_tiles)}"

        if media_type == self.video_placeholder:
            video_inputs = self.image_processor(
                images=None,
                videos=[video_list[idx_in_list]],
                **output_kwargs["videos_kwargs"],
            )
            num_all_tiles = video_inputs["pixel_values"].shape[0]

            frame_timestamps = timestamps_list[idx_in_list] if timestamps_list is not None else None
            # -1 is the "unknown" sentinel; look for it in THIS video's
            # timestamps, not in the outer list of per-video lists.
            if frame_timestamps is not None and -1 in frame_timestamps:
                frame_timestamps = None
            sampled_fps = fps_list[idx_in_list] if fps_list is not None else None

            num_of_tiles_each_frame = [
                self.get_number_tiles_based_on_image_size(
                    image_size,
                    video_min_dynamic_tiles,
                    video_max_dynamic_tiles,
                    video_use_thumbnail,
                    tile_size,
                )
                for image_size in video_inputs["image_sizes"]
            ]
            assert sum(num_of_tiles_each_frame) == num_all_tiles, (
                f"The number of tiles in each frame is not equal to the total number of tiles: {sum(num_of_tiles_each_frame)} != {num_all_tiles}"
            )

            if frame_timestamps is not None:
                assert len(frame_timestamps) == len(num_of_tiles_each_frame), (
                    f"The number of timestamps is not equal to the number of frames: {len(frame_timestamps)} != {len(num_of_tiles_each_frame)}"
                )
                frame_prompts = [
                    f"Frame {i + 1} sample at {frame_timestamps[i]:.2f}s: {media_tokens(num_of_tiles)}"
                    for i, num_of_tiles in enumerate(num_of_tiles_each_frame)
                ]
            else:
                frame_prompts = [
                    f"Frame {i + 1}: {media_tokens(num_of_tiles)}"
                    for i, num_of_tiles in enumerate(num_of_tiles_each_frame)
                ]

            if sampled_fps is not None:
                header = f"The {ordinal(idx_in_list + 1)} video sampled with {sampled_fps:.2f} fps: "
            else:
                header = f"The {ordinal(idx_in_list + 1)} video: "
            unified_frame_list.append(video_inputs)
            num_of_videos_in_this_sample += 1
            return header + "".join(frame_prompts)

        raise ValueError(f"Unknown media type: {media_type}")

    text = pattern.sub(repl, text)
    if unified_frame_list:
        pixel_values = torch.cat([frame["pixel_values"] for frame in unified_frame_list])
        image_sizes = torch.cat([frame["image_sizes"] for frame in unified_frame_list])
    else:
        pixel_values = None
        image_sizes = None
    return (
        text,
        pixel_values,
        image_sizes,
        num_of_images_in_this_sample,
        num_of_videos_in_this_sample,
    )
465
-
466
def __call__(
    self,
    images: ImageInput = None,
    text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
    audio=None,
    videos: VideoInput = None,
    **kwargs: Unpack[Eagle25VLProcessorKwargs],
) -> BatchFeature:
    """
    Prepare text and media for the model.

    Every ``<image-N>`` / ``<video-N>`` placeholder in ``text`` is expanded
    by ``replace_media_placeholder`` (which also runs the image processor
    on the referenced media), and the expanded text is then tokenized with
    ``self.tokenizer`` using the merged ``text_kwargs``.

    Args:
        images: Images referenced by the ``<image-N>`` placeholders.
            Placeholder numbering restarts at 1 for every text sample; the
            images are consumed in order across samples.
        text (`str` or `List[str]`):
            One sample or a batch of samples.
        audio: Unused.
        videos: Videos referenced by the ``<video-N>`` placeholders,
            consumed in order across samples like ``images``.
        **kwargs: Processor keyword arguments merged via ``_merge_kwargs``.
            ``videos_kwargs`` may carry ``timestamps`` and ``fps`` (one
            entry per video); both are removed before the remaining
            ``videos_kwargs`` reach the image processor.

    Returns:
        [`BatchFeature`]: The tokenizer outputs, plus ``pixel_values`` and
        ``image_sizes`` (images and video frames concatenated together)
        when at least one placeholder was expanded.

    Raises:
        ValueError: If ``text`` is not a string or a non-empty list/tuple
            of strings.
    """

    output_kwargs = self._merge_kwargs(
        Eagle25VLProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )

    # Normalise ``text`` to a list of strings; everything else (None, an
    # empty batch, non-string items) is rejected up front.
    if isinstance(text, str):
        text_list = [text]
    elif isinstance(text, (list, tuple)) and text and all(isinstance(item, str) for item in text):
        text_list = list(text)
    else:
        raise ValueError("Invalid input text. Please provide a string, or a list of strings")

    if images is None:
        images = []
    if videos is None:
        videos = []

    pixel_values_list = []
    image_sizes_list = []
    new_sample_list = []
    image_start_idx = 0
    video_start_idx = 0
    timestamps_batch = output_kwargs["videos_kwargs"].pop("timestamps", None)
    fps_batch = output_kwargs["videos_kwargs"].pop("fps", None)
    for sample in text_list:
        # Each sample sees only the media not yet consumed by earlier
        # samples, so its placeholders can start again at 1.
        timestamps_list = timestamps_batch[video_start_idx:] if timestamps_batch is not None else None
        fps_list = fps_batch[video_start_idx:] if fps_batch is not None else None
        (
            sample,
            pixel_values,
            image_sizes,
            num_of_images_in_this_sample,
            num_of_videos_in_this_sample,
        ) = self.replace_media_placeholder(
            sample,
            images[image_start_idx:],
            videos[video_start_idx:],
            timestamps_list,
            fps_list,
            **output_kwargs,
        )
        new_sample_list.append(sample)
        if pixel_values is not None:
            pixel_values_list.append(pixel_values)
            image_sizes_list.append(image_sizes)
        image_start_idx += num_of_images_in_this_sample
        video_start_idx += num_of_videos_in_this_sample

    if pixel_values_list:
        image_inputs = {
            "pixel_values": torch.cat(pixel_values_list),
            "image_sizes": torch.cat(image_sizes_list),
        }
    else:
        image_inputs = {}
    text_inputs = self.tokenizer(new_sample_list, **output_kwargs["text_kwargs"])
    return BatchFeature(data={**text_inputs, **image_inputs})
563
-
564
def get_number_tiles_based_on_image_size(
    self, image_size: tuple, min_num: int, max_num: int, use_thumbnail: bool, tile_size: int
) -> int:
    """
    Compute how many tiles an image of ``image_size`` is split into.

    Args:
        image_size: ``(height, width)`` of the image.
        min_num: Smallest allowed tile count.
        max_num: Largest allowed tile count.
        use_thumbnail: Count one extra tile when the image is split into
            more than one tile.
        tile_size: Tile size forwarded to
            ``self.image_processor.find_closest_aspect_ratio``.

    Returns:
        Product of the selected grid's two entries, plus one when the
        thumbnail rule applies.
    """
    orig_height, orig_width = image_size
    aspect_ratio = orig_width / orig_height

    # Enumerate every grid whose tile count lies within [min_num, max_num].
    candidate_grids = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    candidate_grids.add((i, j))
    ordered_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Let the image processor choose the grid closest to the image's ratio.
    chosen_grid = self.image_processor.find_closest_aspect_ratio(
        aspect_ratio, ordered_grids, orig_width, orig_height, tile_size
    )
    tiles_num = chosen_grid[0] * chosen_grid[1]
    if not use_thumbnail or tiles_num <= 1:
        return tiles_num
    return tiles_num + 1
590
-
591
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
592
def batch_decode(self, *args, **kwargs):
    """
    Delegate to ``self.tokenizer.batch_decode`` with all arguments passed
    through unchanged; see that method for details.
    """
    tokenizer = self.tokenizer
    decoded = tokenizer.batch_decode(*args, **kwargs)
    return decoded
598
-
599
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
600
def decode(self, *args, **kwargs):
    """
    Delegate to ``self.tokenizer.decode`` with all arguments passed through
    unchanged; see that method for details.
    """
    tokenizer = self.tokenizer
    decoded = tokenizer.decode(*args, **kwargs)
    return decoded
606
-
607
@property
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
def model_input_names(self):
    """Tokenizer input names followed by image-processor input names, duplicates dropped."""
    combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
    # dict keys keep first-seen order, which de-duplicates without reordering.
    first_seen = dict.fromkeys(combined)
    return list(first_seen)
613
-
614
- # override to save video-config in a separate config file
615
def save_pretrained(self, save_directory, **kwargs):
    """Create ``save_directory`` if needed, then defer to the parent ``save_pretrained``.

    Raises:
        ValueError: If ``save_directory`` is an existing file.
    """
    path_is_file = os.path.isfile(save_directory)
    if path_is_file:
        raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")

    os.makedirs(save_directory, exist_ok=True)
    return super().save_pretrained(save_directory, **kwargs)
622
-
623
- # override to load video-config from a separate config file
624
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """Load the processor through the parent ``from_pretrained`` and return only the processor.

    When the parent returns a tuple (as it does with
    ``return_unused_kwargs``), its first element is returned and the rest
    is dropped.
    """
    loaded = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
    return loaded[0] if isinstance(loaded, tuple) else loaded
632
-
633
- # Copy from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
634
def process_vision_info(
    self,
    conversations: list[dict] | list[list[dict]],
    return_video_kwargs: bool = False,
) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, dict | None]:
    """Load every image and video referenced in ``conversations``.

    Args:
        conversations: One conversation (list of messages) or a batch.
        return_video_kwargs: Also return the per-video sampling rates and
            timestamps.

    Returns:
        ``(image_inputs, video_inputs)``, or
        ``(image_inputs, video_inputs, {"fps": [...], "timestamps": [...]})``
        when ``return_video_kwargs`` is true. An empty image or video list
        is returned as ``None``.

    Raises:
        ValueError: If an extracted element carries none of the ``image``,
            ``image_url`` or ``video`` keys.
    """
    loaded_images = []
    loaded_videos = []
    fps_per_video = []
    timestamps_per_video = []

    ## Read images or videos
    for info in self.extract_vision_info(conversations):
        if "image" in info or "image_url" in info:
            loaded_images.append(fetch_image(info))
            continue
        if "video" not in info:
            raise ValueError("image, image_url or video should in content.")
        clip, clip_fps, clip_timestamps = fetch_video(info, return_video_sample_fps=True)
        fps_per_video.append(clip_fps)
        loaded_videos.append(clip)
        timestamps_per_video.append(clip_timestamps)

    # Empty lists are reported as None.
    image_inputs = loaded_images or None
    video_inputs = loaded_videos or None
    if not return_video_kwargs:
        return image_inputs, video_inputs
    return (
        image_inputs,
        video_inputs,
        {"fps": fps_per_video, "timestamps": timestamps_per_video},
    )
668
-
669
- def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]:
670
- vision_infos = []
671
- if isinstance(conversations[0], dict):
672
- conversations = [conversations]
673
- for conversation in conversations:
674
- for message in conversation:
675
- if isinstance(message["content"], list):
676
- for ele in message["content"]:
677
- if (
678
- "image" in ele
679
- or "image_url" in ele
680
- or "video" in ele
681
- or ele["type"] in ("image", "image_url", "video")
682
- ):
683
- vision_infos.append(ele)
684
- return vision_infos
685
-
686
def py_apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
    """
    Render a conversation to a prompt string, inserting media placeholders.

    A ``<image-N>`` / ``<video-N>`` token is emitted for each image/video
    content block unless that exact token already occurs somewhere in the
    text of the user messages, in which case it is not added again.

    Args:
        messages (list): Message dicts with:
            - 'role': speaker role (e.g. 'system', 'user', 'assistant').
            - 'content': a string, or a list of blocks. A block may be a
              plain string or a dict with 'type', 'text', 'image',
              'image_url' or 'video' keys.
        tokenize (bool): Must be False; tokenization is not supported.
        add_generation_prompt (bool): Append "<|im_start|>assistant\n" at
            the end of the rendered string.
    Returns:
        str: The rendered chat string.
    """
    assert not tokenize, "tokenize is not supported yet"

    # Pass 1: gather all user-authored text so existing placeholders can
    # be detected.
    user_text_parts = []
    for message in messages:
        if message.get("role") != "user":
            continue
        content = message.get("content")
        if isinstance(content, str):
            user_text_parts.append(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and "text" in item:
                    user_text_parts.append(item["text"])
                elif isinstance(item, str):
                    user_text_parts.append(item)
    message_text = "".join(user_text_parts)

    # Pass 2: render every message.
    pieces = []
    image_count = 0
    video_count = 0
    for position, message in enumerate(messages):
        # Conversations that do not open with a system message get a
        # default one.
        # NOTE(review): this default block puts a newline before
        # <|im_end|>, which the Jinja template in chat_template.json does
        # not -- confirm which form is intended.
        if position == 0 and message.get("role") != "system":
            pieces.extend(("<|im_start|>system\n", "You are a helpful assistant.\n", "<|im_end|>\n"))

        pieces.append(f"<|im_start|>{message.get('role', '')}\n")
        content = message.get("content")

        if isinstance(content, str):
            pieces.append(content)
        else:
            for item in content:
                is_block = isinstance(item, dict)
                if is_block and (item.get("type") == "image" or "image" in item or "image_url" in item):
                    image_count += 1
                    candidate_token = f"<image-{image_count}>"
                    # Skip the token when the user text already has it.
                    if candidate_token not in message_text:
                        pieces.append(candidate_token)
                elif is_block and (item.get("type") == "video" or "video" in item):
                    video_count += 1
                    candidate_token = f"<video-{video_count}>"
                    if candidate_token not in message_text:
                        pieces.append(candidate_token)
                elif is_block and "text" in item:
                    pieces.append(item["text"])
                elif isinstance(item, str):
                    pieces.append(item)
        pieces.append("<|im_end|>\n")

    if add_generation_prompt:
        pieces.append("<|im_start|>assistant\n")

    return "".join(pieces)
775
-
776
- @classmethod
777
- def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
778
- """
779
- Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
780
-
781
- Args:
782
- processor_dict (`Dict[str, Any]`):
783
- Dictionary that will be used to instantiate the processor object. Such a dictionary can be
784
- retrieved from a pretrained checkpoint by leveraging the
785
- [`~processing_utils.ProcessingMixin.to_dict`] method.
786
- kwargs (`Dict[str, Any]`):
787
- Additional parameters from which to initialize the processor object.
788
-
789
- Returns:
790
- [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
791
- parameters.
792
- """
793
- processor_dict = processor_dict.copy()
794
- return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
795
-
796
- # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
797
- # If we don't pop, some specific kwargs will raise a warning
798
- if "processor_class" in processor_dict:
799
- del processor_dict["processor_class"]
800
-
801
- # if "auto_map" in processor_dict:
802
- # del processor_dict["auto_map"]
803
-
804
- unused_kwargs = cls.validate_init_kwargs(
805
- processor_config=processor_dict, valid_kwargs=cls.valid_kwargs
806
- )
807
- processor = cls(*args, **processor_dict)
808
-
809
- # Update processor with kwargs if needed
810
- for key in set(kwargs.keys()):
811
- if hasattr(processor, key):
812
- setattr(processor, key, kwargs.pop(key))
813
-
814
- kwargs.update(unused_kwargs)
815
- logger.info(f"Processor {processor}")
816
- if return_unused_kwargs:
817
- return processor, kwargs
818
- else:
819
- return processor
820
-
821
-
822
- __all__ = ["Eagle25VLProcessor"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
processor_config.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "auto_map": {
3
- "AutoImageProcessor": "image_processing_eagle2_5_vl_fast.Eagle25VLImageProcessorFast",
4
- "AutoProcessor": "processing_eagle2_5_vl.Eagle25VLProcessor"
5
- },
6
- "image_end_token": "</img>",
7
- "image_placeholder": "image",
8
- "image_start_token": "<img>",
9
- "image_token": "<IMG_CONTEXT>",
10
- "processor_class": "Eagle25VLProcessor",
11
- "tokens_per_tile": 256,
12
- "video_placeholder": "video",
13
- "video_token": "<IMG_CONTEXT>",
14
- "vision_feature_select_strategy": null
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>",
16
- "<IMG_CONTEXT>",
17
- "<img>",
18
- "</img>",
19
- "<box>",
20
- "</box>",
21
- "<quad>",
22
- "</quad>",
23
- "<ref>",
24
- "</ref>",
25
- "<interval>",
26
- "</interval>"
27
- ],
28
- "eos_token": {
29
- "content": "<|im_end|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false
34
- },
35
- "pad_token": {
36
- "content": "<|endoftext|>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false
41
- }
42
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json DELETED
@@ -1,344 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": false,
5
- "added_tokens_decoder": {
6
- "151643": {
7
- "content": "<|endoftext|>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "151644": {
15
- "content": "<|im_start|>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "151645": {
23
- "content": "<|im_end|>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": true
29
- },
30
- "151646": {
31
- "content": "<|object_ref_start|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "151647": {
39
- "content": "<|object_ref_end|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "151648": {
47
- "content": "<|box_start|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "151649": {
55
- "content": "<|box_end|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "151650": {
63
- "content": "<|quad_start|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "151651": {
71
- "content": "<|quad_end|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "151652": {
79
- "content": "<|vision_start|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "151653": {
87
- "content": "<|vision_end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "151654": {
95
- "content": "<|vision_pad|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "151655": {
103
- "content": "<|image_pad|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "151656": {
111
- "content": "<|video_pad|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": true
117
- },
118
- "151657": {
119
- "content": "<tool_call>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": false
125
- },
126
- "151658": {
127
- "content": "</tool_call>",
128
- "lstrip": false,
129
- "normalized": false,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": false
133
- },
134
- "151659": {
135
- "content": "<|fim_prefix|>",
136
- "lstrip": false,
137
- "normalized": false,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": false
141
- },
142
- "151660": {
143
- "content": "<|fim_middle|>",
144
- "lstrip": false,
145
- "normalized": false,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": false
149
- },
150
- "151661": {
151
- "content": "<|fim_suffix|>",
152
- "lstrip": false,
153
- "normalized": false,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": false
157
- },
158
- "151662": {
159
- "content": "<|fim_pad|>",
160
- "lstrip": false,
161
- "normalized": false,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": false
165
- },
166
- "151663": {
167
- "content": "<|repo_name|>",
168
- "lstrip": false,
169
- "normalized": false,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": false
173
- },
174
- "151664": {
175
- "content": "<|file_sep|>",
176
- "lstrip": false,
177
- "normalized": false,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": false
181
- },
182
- "151665": {
183
- "content": "<tool_response>",
184
- "lstrip": false,
185
- "normalized": false,
186
- "rstrip": false,
187
- "single_word": false,
188
- "special": false
189
- },
190
- "151666": {
191
- "content": "</tool_response>",
192
- "lstrip": false,
193
- "normalized": false,
194
- "rstrip": false,
195
- "single_word": false,
196
- "special": false
197
- },
198
- "151667": {
199
- "content": "<think>",
200
- "lstrip": false,
201
- "normalized": false,
202
- "rstrip": false,
203
- "single_word": false,
204
- "special": false
205
- },
206
- "151668": {
207
- "content": "</think>",
208
- "lstrip": false,
209
- "normalized": false,
210
- "rstrip": false,
211
- "single_word": false,
212
- "special": false
213
- },
214
- "151669": {
215
- "content": "<IMG_CONTEXT>",
216
- "lstrip": false,
217
- "normalized": false,
218
- "rstrip": false,
219
- "single_word": false,
220
- "special": true
221
- },
222
- "151670": {
223
- "content": "<img>",
224
- "lstrip": false,
225
- "normalized": false,
226
- "rstrip": false,
227
- "single_word": false,
228
- "special": true
229
- },
230
- "151671": {
231
- "content": "</img>",
232
- "lstrip": false,
233
- "normalized": false,
234
- "rstrip": false,
235
- "single_word": false,
236
- "special": true
237
- },
238
- "151672": {
239
- "content": "<box>",
240
- "lstrip": false,
241
- "normalized": false,
242
- "rstrip": false,
243
- "single_word": false,
244
- "special": true
245
- },
246
- "151673": {
247
- "content": "</box>",
248
- "lstrip": false,
249
- "normalized": false,
250
- "rstrip": false,
251
- "single_word": false,
252
- "special": true
253
- },
254
- "151674": {
255
- "content": "<quad>",
256
- "lstrip": false,
257
- "normalized": false,
258
- "rstrip": false,
259
- "single_word": false,
260
- "special": true
261
- },
262
- "151675": {
263
- "content": "</quad>",
264
- "lstrip": false,
265
- "normalized": false,
266
- "rstrip": false,
267
- "single_word": false,
268
- "special": true
269
- },
270
- "151676": {
271
- "content": "<ref>",
272
- "lstrip": false,
273
- "normalized": false,
274
- "rstrip": false,
275
- "single_word": false,
276
- "special": true
277
- },
278
- "151677": {
279
- "content": "</ref>",
280
- "lstrip": false,
281
- "normalized": false,
282
- "rstrip": false,
283
- "single_word": false,
284
- "special": true
285
- },
286
- "151678": {
287
- "content": "<interval>",
288
- "lstrip": false,
289
- "normalized": false,
290
- "rstrip": false,
291
- "single_word": false,
292
- "special": true
293
- },
294
- "151679": {
295
- "content": "</interval>",
296
- "lstrip": false,
297
- "normalized": false,
298
- "rstrip": false,
299
- "single_word": false,
300
- "special": true
301
- }
302
- },
303
- "additional_special_tokens": [
304
- "<|im_start|>",
305
- "<|im_end|>",
306
- "<|object_ref_start|>",
307
- "<|object_ref_end|>",
308
- "<|box_start|>",
309
- "<|box_end|>",
310
- "<|quad_start|>",
311
- "<|quad_end|>",
312
- "<|vision_start|>",
313
- "<|vision_end|>",
314
- "<|vision_pad|>",
315
- "<|image_pad|>",
316
- "<|video_pad|>",
317
- "<IMG_CONTEXT>",
318
- "<img>",
319
- "</img>",
320
- "<box>",
321
- "</box>",
322
- "<quad>",
323
- "</quad>",
324
- "<ref>",
325
- "</ref>",
326
- "<interval>",
327
- "</interval>"
328
- ],
329
- "auto_map": {
330
- "AutoProcessor": "processing_eagle2_5_vl.Eagle25VLProcessor"
331
- },
332
- "bos_token": null,
333
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
334
- "clean_up_tokenization_spaces": false,
335
- "eos_token": "<|im_end|>",
336
- "errors": "replace",
337
- "extra_special_tokens": {},
338
- "model_max_length": 16384,
339
- "pad_token": "<|endoftext|>",
340
- "processor_class": "Eagle25VLProcessor",
341
- "split_special_tokens": false,
342
- "tokenizer_class": "Qwen2Tokenizer",
343
- "unk_token": null
344
- }