rooty2020 committed on
Commit
d3697ab
·
verified ·
1 Parent(s): d929524

Upload models

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</abs_vis_token>": 151682,
3
+ "</box>": 151673,
4
+ "</img>": 151671,
5
+ "</interval>": 151679,
6
+ "</observation>": 151684,
7
+ "</quad>": 151675,
8
+ "</ref>": 151677,
9
+ "</think>": 151668,
10
+ "</tool_call>": 151658,
11
+ "</tool_response>": 151666,
12
+ "<IMG_CONTEXT>": 151669,
13
+ "<abs_vis_token>": 151681,
14
+ "<abs_vis_token_pad>": 151680,
15
+ "<box>": 151672,
16
+ "<img>": 151670,
17
+ "<interval>": 151678,
18
+ "<observation>": 151683,
19
+ "<quad>": 151674,
20
+ "<ref>": 151676,
21
+ "<think>": 151667,
22
+ "<tool_call>": 151657,
23
+ "<tool_response>": 151665,
24
+ "<|box_end|>": 151649,
25
+ "<|box_start|>": 151648,
26
+ "<|endoftext|>": 151643,
27
+ "<|file_sep|>": 151664,
28
+ "<|fim_middle|>": 151660,
29
+ "<|fim_pad|>": 151662,
30
+ "<|fim_prefix|>": 151659,
31
+ "<|fim_suffix|>": 151661,
32
+ "<|im_end|>": 151645,
33
+ "<|im_start|>": 151644,
34
+ "<|image_pad|>": 151655,
35
+ "<|object_ref_end|>": 151647,
36
+ "<|object_ref_start|>": 151646,
37
+ "<|quad_end|>": 151651,
38
+ "<|quad_start|>": 151650,
39
+ "<|repo_name|>": 151663,
40
+ "<|video_pad|>": 151656,
41
+ "<|vision_end|>": 151653,
42
+ "<|vision_pad|>": 151654,
43
+ "<|vision_start|>": 151652
44
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}<image {{ image_count.value }}>{% endif %}<image-{{ image_count.value }}>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}<video {{ video_count.value }}>{% endif %}<video-{{ video_count.value }}>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation": "flash_attention_2",
3
+ "_attn_implementation_autoset": false,
4
+ "answer_start_pattern": [
5
+ 151644,
6
+ 77091
7
+ ],
8
+ "architectures": [
9
+ "Eagle3_VLForConditionalGeneration"
10
+ ],
11
+ "auto_map": {
12
+ "AutoConfig": "configuration_eagle3_vl.Eagle3_VLConfig",
13
+ "AutoModel": "modeling_eagle3_vl.Eagle3_VLForConditionalGeneration",
14
+ "AutoModelForCausalLM": "modeling_eagle3_vl.Eagle3_VLForConditionalGeneration"
15
+ },
16
+ "downsample_ratio": 0.5,
17
+ "dtype": "bfloat16",
18
+ "dynamic_image_size": false,
19
+ "eos_token_id": 151645,
20
+ "image_token_index": 151669,
21
+ "initializer_range": 0.02,
22
+ "latent_end_id": 151682,
23
+ "latent_start_id": 151681,
24
+ "latent_token_id": 151680,
25
+ "loss_type": "ForCausalLMLoss",
26
+ "loss_version": "efficient_v2_cp_head",
27
+ "max_dynamic_tiles": 12,
28
+ "min_dynamic_tiles": 1,
29
+ "mlp_checkpoint": false,
30
+ "mlp_connector_layers": 2,
31
+ "model_type": "eagle_3_vl",
32
+ "output_attentions": false,
33
+ "pad2square": false,
34
+ "pad_token_id": 151643,
35
+ "select_layer": -1,
36
+ "stage": "sft_stage3",
37
+ "template": null,
38
+ "text_config": {
39
+ "_attn_implementation_autoset": true,
40
+ "_name_or_path": "Qwen/Qwen3-1.7B",
41
+ "architectures": [
42
+ "Qwen3ForCausalLM"
43
+ ],
44
+ "attention_bias": false,
45
+ "attention_dropout": 0.0,
46
+ "bos_token_id": 151643,
47
+ "dtype": "bfloat16",
48
+ "eos_token_id": 151645,
49
+ "head_dim": 128,
50
+ "hidden_act": "silu",
51
+ "hidden_size": 2048,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 6144,
54
+ "layer_types": [
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
+ "full_attention",
80
+ "full_attention",
81
+ "full_attention",
82
+ "full_attention"
83
+ ],
84
+ "max_position_embeddings": 40960,
85
+ "max_window_layers": 28,
86
+ "model_type": "qwen3",
87
+ "num_attention_heads": 16,
88
+ "num_hidden_layers": 28,
89
+ "num_key_value_heads": 8,
90
+ "rms_norm_eps": 1e-06,
91
+ "rope_scaling": null,
92
+ "rope_theta": 1000000,
93
+ "sliding_window": null,
94
+ "tie_word_embeddings": true,
95
+ "use_cache": false,
96
+ "use_sliding_window": false,
97
+ "vocab_size": 151685
98
+ },
99
+ "tie_word_embeddings": true,
100
+ "transformers_version": null,
101
+ "use_backbone_lora": 0,
102
+ "use_cache": false,
103
+ "use_llm_lora": 0,
104
+ "use_pixel_shuffle": true,
105
+ "use_thumbnail": false,
106
+ "vision_config": {
107
+ "_attn_implementation_autoset": true,
108
+ "attention_dropout": 0.0,
109
+ "dtype": "bfloat16",
110
+ "full_attention_indexes": [
111
+ 7,
112
+ 14,
113
+ 21,
114
+ 26
115
+ ],
116
+ "hidden_act": "gelu_pytorch_tanh",
117
+ "hidden_size": 1152,
118
+ "intermediate_size": 4304,
119
+ "layer_norm_eps": 1e-06,
120
+ "model_type": "siglip2_vision_model",
121
+ "num_attention_heads": 16,
122
+ "num_channels": 3,
123
+ "num_hidden_layers": 27,
124
+ "num_patches": 256,
125
+ "patch_size": 14,
126
+ "use_rope": false,
127
+ "use_windows_attn": false,
128
+ "window_size": 14
129
+ },
130
+ "vocab_size": 151685
131
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "transformers_version": "4.57.3"
9
+ }
image_processing_eagle3_vl_fast.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # NVIDIA
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
8
+ from typing import Any, List, Optional, Union
9
+
10
+ from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
11
+ try:
12
+ from transformers.image_processing_utils_fast import (
13
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
14
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
15
+ BaseImageProcessorFast,
16
+ DefaultFastImageProcessorKwargs,
17
+ divide_to_patches,
18
+ group_images_by_shape,
19
+ reorder_images,
20
+ )
21
+ except ImportError:
22
+ from transformers.image_processing_utils_fast import (
23
+ BaseImageProcessorFast,
24
+ DefaultFastImageProcessorKwargs,
25
+ divide_to_patches,
26
+ group_images_by_shape,
27
+ reorder_images,
28
+ )
29
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = ""
30
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = ""
31
+ try:
32
+ from transformers.image_utils import (
33
+ OPENAI_CLIP_MEAN,
34
+ OPENAI_CLIP_STD,
35
+ IMAGENET_STANDARD_MEAN, # 0.5, 0.5, 0.5
36
+ IMAGENET_STANDARD_STD, # 0.5, 0.5, 0.5
37
+ ChannelDimension,
38
+ ImageInput,
39
+ VideoInput,
40
+ PILImageResampling,
41
+ SizeDict,
42
+ get_image_size,
43
+ make_flat_list_of_images,
44
+ make_batched_videos,
45
+ validate_kwargs,
46
+ )
47
+ except ImportError:
48
+ from transformers.image_utils import (
49
+ OPENAI_CLIP_MEAN,
50
+ OPENAI_CLIP_STD,
51
+ IMAGENET_STANDARD_MEAN, # 0.5, 0.5, 0.5
52
+ IMAGENET_STANDARD_STD, # 0.5, 0.5, 0.5
53
+ ChannelDimension,
54
+ ImageInput,
55
+ PILImageResampling,
56
+ SizeDict,
57
+ get_image_size,
58
+ make_flat_list_of_images,
59
+ validate_kwargs,
60
+ )
61
+ VideoInput = Any
62
+
63
+ def make_batched_videos(videos):
64
+ return videos
65
+ from transformers.processing_utils import Unpack
66
+ from transformers.utils import TensorType, add_start_docstrings, is_torch_available, is_torchvision_v2_available
67
+
68
+
69
+ if is_torch_available():
70
+ import torch
71
+ if is_torchvision_v2_available():
72
+ from transformers.image_utils import pil_torch_interpolation_mapping
73
+
74
+ from torchvision.transforms.v2 import functional as F
75
+ else:
76
+ from torchvision.transforms import functional as F
77
+
78
+ def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> torch.Tensor:
79
+ """Crop the given numpy array.
80
+
81
+ Args:
82
+ img (torch.Tensor): Image to be cropped. Format should be (C, H, W).
83
+ left (int): The left coordinate of the crop box.
84
+ top (int): The top coordinate of the crop box.
85
+ right (int): The right coordinate of the crop box.
86
+ bottom (int): The bottom coordinate of the crop box.
87
+
88
+ Returns:
89
+ torch.Tensor: Cropped image.
90
+ """
91
+ if not isinstance(img, torch.Tensor):
92
+ raise TypeError('img should be torch.Tensor. Got {}'.format(type(img)))
93
+
94
+ if img.ndim not in [2, 3]:
95
+ raise ValueError('Image should have 2 or 3 dimensions. Got {}'.format(img.ndim))
96
+
97
+ img_height = img.shape[1]
98
+ img_width = img.shape[2]
99
+ if top < 0 or left < 0 or bottom > img_height or right > img_width:
100
+ raise ValueError('Crop coordinates out of bounds')
101
+
102
+ if top >= bottom or left >= right:
103
+ raise ValueError('Invalid crop coordinates')
104
+
105
+ return img[:, top:bottom, left:right]
106
+
107
+
108
+ class Eagle3_VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
109
+ do_pad: Optional[bool]
110
+
111
+
112
+ @add_start_docstrings(
113
+ "Constructs a fast ConvNeXT image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.",
114
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
115
+ """
116
+ image_grid_pinpoints (`List[List[int]]`, *optional*):
117
+ A list of possible resolutions to use for processing high resolution images. The best resolution is selected
118
+ based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
119
+ method. Not used for processing videos.
120
+ do_pad (`bool`, *optional*):
121
+ Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
122
+ number of patches in the batch. Padding will be applied to the bottom and right with zeros.
123
+ """,
124
+ )
125
+ class Eagle3_VLImageProcessorFast(BaseImageProcessorFast):
126
+ resample = PILImageResampling.BICUBIC
127
+ image_mean = IMAGENET_STANDARD_MEAN
128
+ image_std = IMAGENET_STANDARD_STD
129
+ size = {"height": 448, "width": 448}
130
+ default_to_square = False
131
+ crop_size = None
132
+ do_resize = True
133
+ do_center_crop = None
134
+ do_rescale = True
135
+ do_normalize = True
136
+ do_convert_rgb = True
137
+ do_pad = True
138
+ valid_kwargs = Eagle3_VLFastImageProcessorKwargs
139
+ model_input_names = ["pixel_values_videos"]
140
+
141
+ def __init__(self, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]):
142
+ super().__init__(**kwargs)
143
+
144
+ @add_start_docstrings(
145
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
146
+ """
147
+ do_pad (`bool`, *optional*):
148
+ Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
149
+ number of patches in the batch. Padding will be applied to the bottom and right with zeros.
150
+ """,
151
+ )
152
+ def preprocess(self, images: ImageInput, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]) -> BatchFeature:
153
+ return super().preprocess(images, **kwargs)
154
+
155
+ def _prepare_images_structure(
156
+ self,
157
+ images: ImageInput,
158
+ expected_ndims: int = 3,
159
+ ) -> ImageInput:
160
+ """
161
+ Prepare the images structure for processing.
162
+
163
+ Args:
164
+ images (`ImageInput`):
165
+ The input images to process.
166
+
167
+ Returns:
168
+ `ImageInput`: The images with a valid nesting.
169
+ """
170
+ try:
171
+ return make_flat_list_of_images(images, expected_ndims=expected_ndims)
172
+ except TypeError:
173
+ return make_flat_list_of_images(images)
174
+
175
+ def _preprocess(
176
+ self,
177
+ images: List["torch.Tensor"],
178
+ do_resize: bool,
179
+ size: SizeDict,
180
+ interpolation: Optional["F.InterpolationMode"],
181
+ do_center_crop: bool,
182
+ crop_size: SizeDict,
183
+ do_rescale: bool,
184
+ rescale_factor: float,
185
+ do_normalize: bool,
186
+ image_mean: Optional[Union[float, List[float]]],
187
+ image_std: Optional[Union[float, List[float]]],
188
+ do_pad: bool,
189
+ return_tensors: Optional[Union[str, TensorType]],
190
+ disable_grouping: Optional[bool] = None,
191
+ **kwargs,
192
+ ) -> BatchFeature:
193
+
194
+ image_sizes = [get_image_size(image, channel_dim=ChannelDimension.FIRST) for image in images]
195
+
196
+ # Group images by size for further processing
197
+ # Needed in case do_resize is False, or resize returns images with different sizes
198
+ try:
199
+ grouped_images, grouped_images_index = group_images_by_shape(
200
+ images, disable_grouping=disable_grouping
201
+ )
202
+ except TypeError:
203
+ grouped_images, grouped_images_index = group_images_by_shape(images)
204
+ processed_images_grouped = {}
205
+ for shape, stacked_images in grouped_images.items():
206
+ # Fused rescale and normalize
207
+ stacked_images = self.rescale_and_normalize(
208
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
209
+ )
210
+ processed_images_grouped[shape] = stacked_images
211
+
212
+ processed_images = reorder_images(processed_images_grouped, grouped_images_index)
213
+ processed_images = torch.stack(processed_images)
214
+
215
+ return BatchFeature(
216
+ data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
217
+ )
218
+
219
+
220
+ def preprocess(self, images: ImageInput, videos: VideoInput=None, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]) -> BatchFeature:
221
+ valid_keys = getattr(self, "_valid_kwargs_names", list(self.valid_kwargs.__annotations__.keys()))
222
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=valid_keys)
223
+ # Set default kwargs from self. This ensures that if a kwarg is not provided
224
+ # by the user, it gets its default value from the instance, or is set to None.
225
+ for kwarg_name in valid_keys:
226
+ kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
227
+
228
+ # Extract parameters that are only used for preparing the input images
229
+ do_convert_rgb = kwargs.pop("do_convert_rgb")
230
+ input_data_format = kwargs.pop("input_data_format")
231
+ device = kwargs.pop("device")
232
+
233
+ # Transformers API compatibility:
234
+ # newer versions expose `_prepare_image_like_inputs`, older forks may still use `_prepare_input_images`.
235
+ prepare_inputs = getattr(self, "_prepare_image_like_inputs", None)
236
+ if prepare_inputs is None:
237
+ prepare_inputs = getattr(self, "_prepare_input_images", None)
238
+ if prepare_inputs is None:
239
+ raise AttributeError("No image preparation helper found on Eagle3_VLImageProcessorFast.")
240
+
241
+ # Prepare input images/videos
242
+ if images is not None:
243
+ images = prepare_inputs(
244
+ images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
245
+ )
246
+
247
+ if videos is not None:
248
+ videos = prepare_inputs(
249
+ images=videos, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
250
+ )
251
+
252
+ # Update kwargs that need further processing before being validated
253
+ kwargs = self._further_process_kwargs(**kwargs)
254
+
255
+ # Validate kwargs
256
+ self._validate_preprocess_kwargs(**kwargs)
257
+
258
+ # Some older transformers builds still expect manual `resample` -> `interpolation`.
259
+ if "resample" in kwargs:
260
+ resample = kwargs.pop("resample")
261
+ kwargs["interpolation"] = (
262
+ pil_torch_interpolation_mapping[resample]
263
+ if isinstance(resample, (PILImageResampling, int))
264
+ else resample
265
+ )
266
+
267
+ # Pop kwargs that are not needed in _preprocess
268
+ kwargs.pop("default_to_square", None)
269
+ kwargs.pop("data_format", None)
270
+
271
+ if images is not None:
272
+ return self._preprocess(images, **kwargs)
273
+ if videos is not None:
274
+ return self._preprocess(videos, **kwargs)
275
+ raise ValueError("Either `images` or `videos` must be provided.")
276
+
277
+ __all__ = ["Eagle3_VLImageProcessorFast"]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4b7bd170dfba8f1ead4cfab076bc06752256c8dffa6a763118ad9617f601400
3
+ size 4948373920
preprocessor_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_eagle3_vl_fast.Eagle3_VLImageProcessorFast",
4
+ "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
5
+ },
6
+ "crop_size": null,
7
+ "data_format": "channels_first",
8
+ "default_to_square": false,
9
+ "device": null,
10
+ "disable_grouping": null,
11
+ "do_center_crop": null,
12
+ "do_convert_rgb": true,
13
+ "do_normalize": true,
14
+ "do_pad": false,
15
+ "do_rescale": true,
16
+ "do_resize": false,
17
+ "image_mean": [
18
+ 0.5,
19
+ 0.5,
20
+ 0.5
21
+ ],
22
+ "image_processor_type": "Eagle3_VLImageProcessorFast",
23
+ "image_std": [
24
+ 0.5,
25
+ 0.5,
26
+ 0.5
27
+ ],
28
+ "input_data_format": null,
29
+ "pad_size": null,
30
+ "processor_class": "Eagle3_VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "height": 448,
36
+ "width": 448
37
+ }
38
+ }
processing_eagle3_vl.py ADDED
@@ -0,0 +1,914 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Eagle3_VL.
17
+ copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/processing_llava_onevision.py
18
+ """
19
+
20
+ import math
21
+ import os
22
+ from typing import Iterable, List, Union, Literal
23
+ import base64
24
+ import sys
25
+ import time
26
+ import warnings
27
+ from functools import lru_cache
28
+ from io import BytesIO
29
+ import re
30
+ import requests
31
+ import torch
32
+ import torchvision
33
+ from packaging import version
34
+ from PIL import Image
35
+ from torchvision import io
36
+ from torchvision import transforms
37
+ from torch.nn import functional as F
38
+ from torchvision.transforms import InterpolationMode
39
+ from typing import Optional, Any
40
+ import numpy as np
41
+
42
+ from transformers.feature_extraction_utils import BatchFeature
43
+ from transformers.image_processing_utils import select_best_resolution
44
+ try:
45
+ from transformers.image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
46
+ except ImportError:
47
+ from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
48
+ VideoInput = Any
49
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
50
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
51
+ from transformers.utils import logging
52
+ from transformers.models.auto import AutoImageProcessor
53
+ import lmdb
54
+ import cv2
55
+ import pickle
56
+ logger = logging.get_logger(__name__)
57
+
58
+ # Highly inspired by https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
59
+
60
+ FRAME_FACTOR = 2
61
+ FPS = 2.0
62
+ FPS_MIN_FRAMES = 4
63
+ FPS_MAX_FRAMES = 256
64
+
65
+ IMAGE_FACTOR = 28
66
+ MIN_PIXELS = 4 * 28 * 28
67
+ MAX_PIXELS = 4096 * 28 * 28
68
+ MAX_RATIO = 200
69
+ IMAGE_MAX_SIZE = 500 * 14
70
+
71
+
72
+ VIDEO_MIN_PIXELS = 128 * 28 * 28
73
+ VIDEO_MAX_PIXELS = 768 * 28 * 28
74
+
75
+ # Set the maximum number of video token inputs.
76
+ # Here, 128K represents the maximum number of input tokens for the VLLM model.
77
+ # Remember to adjust it according to your own configuration.
78
+ VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
79
+ logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
80
+
81
+
82
+
83
+
84
+ def adjust_by_factor(number: int, factor: int, method: Literal['round', 'ceil', 'floor'] = 'round') -> int:
85
+ """Adjusts 'number' to the nearest, ceiling, or floor multiple of 'factor'."""
86
+ op = {'round': round, 'ceil': math.ceil, 'floor': math.floor}[method]
87
+ return op(number / factor) * factor
88
+
89
+
90
+ def to_rgb(pil_image: Image.Image) -> Image.Image:
91
+ if pil_image.mode == 'RGBA':
92
+ white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
93
+ white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
94
+ return white_background
95
+ else:
96
+ return pil_image.convert("RGB")
97
+
98
+ def smart_resize(
99
+ height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
100
+ ) -> tuple[int, int]:
101
+ """
102
+ Rescales the image so that the following conditions are met:
103
+ 1. Both dimensions (height and width) are divisible by 'factor'.
104
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
105
+ 3. The aspect ratio of the image is maintained as closely as possible.
106
+ """
107
+ if max(height, width) / min(height, width) > MAX_RATIO:
108
+ raise ValueError(
109
+ f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
110
+ )
111
+
112
+
113
+ h_bar = min(max(factor, adjust_by_factor(height, factor, method='round')), IMAGE_MAX_SIZE)
114
+ w_bar = min(max(factor, adjust_by_factor(width, factor, method='round')), IMAGE_MAX_SIZE)
115
+ if h_bar * w_bar > max_pixels:
116
+ beta = math.sqrt((h_bar * w_bar) / max_pixels)
117
+ h_bar = adjust_by_factor(h_bar / beta, factor, method='floor')
118
+ w_bar = adjust_by_factor(w_bar / beta, factor, method='floor')
119
+ elif h_bar * w_bar < min_pixels:
120
+ beta = math.sqrt(min_pixels / (height * width))
121
+ h_bar = adjust_by_factor(height * beta, factor, method='ceil')
122
+ w_bar = adjust_by_factor(width * beta, factor, method='ceil')
123
+
124
+ return h_bar, w_bar
125
+
126
+
127
+ def read_img_from_lmdb_v2(image_data):
128
+ # special case for AgiBotWorld
129
+ lmdb_file, lmdb_key = image_data['lmdb_file'], image_data['lmdb_key']
130
+ key = lmdb_key.encode('ascii')
131
+ env = lmdb.open(lmdb_file, max_readers=10240, readonly=True, lock=False, readahead=False, meminit=False)
132
+ txn = env.begin()
133
+ value = txn.get(key)
134
+ if value is None:
135
+ print(f"Warning: Key {key} not found.")
136
+ return None
137
+ record = pickle.loads(value)
138
+ image_bgr = cv2.imdecode(np.frombuffer(record['image'], dtype=np.uint8), cv2.IMREAD_COLOR)
139
+ image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
140
+ image = Image.fromarray(image_rgb)
141
+
142
+ return image
143
+
144
+ def parse_lmdb_image_data(image_data):
145
+ lmdb_file = image_data['lmdb_file']
146
+ if not os.path.exists(lmdb_file):
147
+ if "/home/zhidingy/workspace/libs/eagle/Eagle2/" in lmdb_file:
148
+ lmdb_file = lmdb_file.replace("/home/zhidingy/workspace/libs/eagle/Eagle2/", "")
149
+ else:
150
+ raise ValueError(f"LMDB file {lmdb_file} does not exist")
151
+
152
+ # special case for AgiBotWorld, will remove it later
153
+ if 'AgiBotWorld' in image_data['lmdb_file']:
154
+ return read_img_from_lmdb_v2(image_data)
155
+
156
+
157
+ try:
158
+ env = lmdb.open(image_data['lmdb_file'], readonly=True, lock=False, max_readers=10240)
159
+ except Exception as e:
160
+ print(f"Failed to open lmdb file {image_data['lmdb_file']}. Error message: {e}", flush=True)
161
+ raise e
162
+
163
+ with env.begin(write=False) as txn:
164
+ try:
165
+ image_bin = txn.get(image_data['lmdb_key'].encode('ascii'))
166
+ buf = BytesIO(image_bin)
167
+ except Exception as e:
168
+ print(f"Failed to get image from lmdb file {image_data['lmdb_file']}. Error message: {e}", flush=True)
169
+ raise e
170
+ try:
171
+ image = Image.open(buf)
172
+ except Exception as e:
173
+ image_np = np.frombuffer(image_bin, dtype=np.uint8)
174
+ image_bgr = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
175
+ image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
176
+ image = Image.fromarray(image_rgb)
177
+ return image
178
+
179
+ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
180
+ if "image" in ele:
181
+ image = ele["image"]
182
+ else:
183
+ image = ele["image_url"]
184
+ image_obj = None
185
+ if isinstance(image, Image.Image):
186
+ image_obj = image
187
+ elif isinstance(image, dict) and 'lmdb_file' in image:
188
+ image_obj = parse_lmdb_image_data(image)
189
+ elif image.startswith("http://") or image.startswith("https://"):
190
+ response = requests.get(image, stream=True)
191
+ image_obj = Image.open(BytesIO(response.content))
192
+ elif image.startswith("file://"):
193
+ image_obj = Image.open(image[7:])
194
+ elif image.startswith("data:image"):
195
+ if "base64," in image:
196
+ _, base64_data = image.split("base64,", 1)
197
+ data = base64.b64decode(base64_data)
198
+ image_obj = Image.open(BytesIO(data))
199
+ else:
200
+ image_obj = Image.open(image)
201
+ if image_obj is None:
202
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
203
+ image = to_rgb(image_obj)
204
+ # if 'scale_factor' in ele:
205
+ # scale_factor = ele['scale_factor']
206
+ # image = image.resize((image.width * scale_factor, image.height * scale_factor), Image.BILINEAR)
207
+
208
+ if "resized_height" in ele and "resized_width" in ele:
209
+ resized_height, resized_width = smart_resize(
210
+ ele["resized_height"],
211
+ ele["resized_width"],
212
+ factor=size_factor,
213
+ )
214
+ else:
215
+ width, height = image.size
216
+ min_pixels = ele.get("min_pixels", MIN_PIXELS)
217
+ max_pixels = ele.get("max_pixels", MAX_PIXELS)
218
+ resized_height, resized_width = smart_resize(
219
+ height,
220
+ width,
221
+ factor=size_factor,
222
+ min_pixels=min_pixels,
223
+ max_pixels=max_pixels,
224
+ )
225
+ image = image.resize((resized_width, resized_height))
226
+
227
+ return image
228
+
229
+
230
+ def smart_nframes(
231
+ ele: dict,
232
+ total_frames: int,
233
+ video_fps: int | float,
234
+ ) -> int:
235
+ """calculate the number of frames for video used for model inputs.
236
+ Args:
237
+ ele (dict): a dict contains the configuration of video.
238
+ support either `fps` or `nframes`:
239
+ - nframes: the number of frames to extract for model inputs.
240
+ - fps: the fps to extract frames for model inputs.
241
+ - min_frames: the minimum number of frames of the video, only used when fps is provided.
242
+ - max_frames: the maximum number of frames of the video, only used when fps is provided.
243
+ total_frames (int): the original total number of frames of the video.
244
+ video_fps (int | float): the original fps of the video.
245
+ Raises:
246
+ ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
247
+ Returns:
248
+ int: the number of frames for video used for model inputs.
249
+ """
250
+ assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
251
+ if "nframes" in ele:
252
+ nframes = adjust_by_factor(ele["nframes"], FRAME_FACTOR, method='round')
253
+ else:
254
+ fps = ele.get("fps", FPS)
255
+ min_frames = adjust_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR, method='ceil')
256
+ max_frames = adjust_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR, method='floor')
257
+ nframes = total_frames / video_fps * fps
258
+ if nframes > total_frames:
259
+ logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
260
+ nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
261
+ nframes = adjust_by_factor(nframes, FRAME_FACTOR, method='floor')
262
+ if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
263
+ # raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
264
+ nframes = total_frames
265
+ return nframes
266
+
267
+ def _read_video_torchvision(
268
+ ele: dict,
269
+ ) -> (torch.Tensor, float, list):
270
+ """read video using torchvision.io.read_video and return also per-frame timestamps"""
271
+ video_path = ele["video"]
272
+ if version.parse(torchvision.__version__) < version.parse("0.19.0"):
273
+ if "http://" in video_path or "https://" in video_path:
274
+ warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
275
+ if "file://" in video_path:
276
+ video_path = video_path[7:]
277
+ st = time.time()
278
+ video, audio, info = io.read_video(
279
+ video_path,
280
+ start_pts=ele.get("video_start", 0.0),
281
+ end_pts=ele.get("video_end", None),
282
+ pts_unit="sec",
283
+ output_format="TCHW",
284
+ )
285
+ total_frames, video_fps = video.size(0), info["video_fps"]
286
+ logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
287
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
288
+ # Calculate frame indices and corresponding timestamps (based on video start time)
289
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long()
290
+ start_time = ele.get("video_start", 0.0)
291
+ timestamps = (start_time + idx.to(torch.float32) / video_fps).tolist()
292
+ sample_fps = nframes / max(total_frames, 1e-6) * video_fps
293
+ video = video[idx]
294
+ return video, sample_fps, timestamps
295
+
296
+
297
+
298
+ def is_pyav_available() -> bool:
299
+ import importlib.util
300
+
301
+ return importlib.util.find_spec("av") is not None
302
+
303
+ def _read_video_pyav(
304
+ ele: dict,
305
+ ) -> (torch.Tensor, float, list):
306
+ """read video using pyav and return also per-frame timestamps"""
307
+ import av
308
+ video_path = ele["video"]
309
+ st = time.time()
310
+
311
+ # Open video file
312
+ container = av.open(video_path)
313
+ video_stream = container.streams.video[0]
314
+
315
+ # Get video properties
316
+ total_frames = video_stream.frames
317
+ video_fps = float(video_stream.average_rate)
318
+
319
+ # Handle video start and end times
320
+ start_time = ele.get("video_start", 0.0)
321
+ end_time = ele.get("video_end", None)
322
+
323
+ if start_time > 0 or end_time is not None:
324
+ # Seek to start time
325
+ start_pts = int(start_time * video_stream.time_base.denominator / video_stream.time_base.numerator)
326
+ container.seek(start_pts, stream=video_stream)
327
+
328
+ # Calculate end pts if specified
329
+ if end_time is not None:
330
+ end_pts = int(end_time * video_stream.time_base.denominator / video_stream.time_base.numerator)
331
+ else:
332
+ end_pts = None
333
+ else:
334
+ end_pts = None
335
+
336
+ logger.info(f"pyav: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
337
+
338
+ # Calculate number of frames to extract
339
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
340
+
341
+ # Calculate frame indices and timestamps
342
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
343
+ timestamps = [start_time + i / video_fps for i in idx]
344
+
345
+ # Extract frames
346
+ frames = []
347
+ frame_count = 0
348
+ target_frame_indices = set(idx)
349
+
350
+ for frame in container.decode(video_stream):
351
+ if frame_count in target_frame_indices:
352
+ # Convert frame to RGB numpy array
353
+ frame_array = frame.to_ndarray(format='rgb24')
354
+ frames.append(frame_array)
355
+
356
+ frame_count += 1
357
+
358
+ # Stop if we've reached the end time or have enough frames
359
+ if end_pts is not None and frame.pts >= end_pts:
360
+ break
361
+ if len(frames) >= nframes:
362
+ break
363
+
364
+ container.close()
365
+
366
+ # Convert to tensor
367
+ if frames:
368
+ video = torch.tensor(np.stack(frames)).permute(0, 3, 1, 2) # Convert to TCHW format
369
+ else:
370
+ # Fallback: create empty tensor with correct shape
371
+ video = torch.zeros((nframes, 3, 224, 224), dtype=torch.uint8)
372
+
373
+ sample_fps = nframes / max(total_frames, 1e-6) * video_fps
374
+ return video, sample_fps, timestamps
375
+
376
+
377
+ VIDEO_READER_BACKENDS = {
378
+ "pyav": _read_video_pyav,
379
+ "torchvision": _read_video_torchvision,
380
+ }
381
+
382
+
383
+ @lru_cache(maxsize=1)
384
+ def get_video_reader_backend() -> str:
385
+ if is_pyav_available():
386
+ video_reader_backend = "pyav"
387
+ else:
388
+ video_reader_backend = "torchvision"
389
+ return video_reader_backend
390
+
391
+
392
+
393
+
394
+ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
395
+
396
+ if isinstance(ele["video"], str):
397
+ video_reader_backend = get_video_reader_backend()
398
+ try:
399
+ video, sample_fps, timestamps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
400
+ except Exception as e:
401
+ logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
402
+ video, sample_fps, timestamps = VIDEO_READER_BACKENDS["torchvision"](ele)
403
+
404
+ nframes, _, height, width = video.shape
405
+
406
+ min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
407
+ total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
408
+ max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
409
+ max_pixels_supposed = ele.get("max_pixels", max_pixels)
410
+ if max_pixels_supposed > max_pixels:
411
+ logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
412
+ max_pixels = min(max_pixels_supposed, max_pixels)
413
+ if "resized_height" in ele and "resized_width" in ele:
414
+ resized_height, resized_width = smart_resize(
415
+ ele["resized_height"],
416
+ ele["resized_width"],
417
+ factor=image_factor,
418
+ )
419
+ else:
420
+ resized_height, resized_width = smart_resize(
421
+ height,
422
+ width,
423
+ factor=image_factor,
424
+ min_pixels=min_pixels,
425
+ max_pixels=max_pixels,
426
+ )
427
+ video = transforms.functional.resize(
428
+ video,
429
+ [resized_height, resized_width],
430
+ interpolation=InterpolationMode.BICUBIC,
431
+ antialias=True,
432
+ ).float()
433
+ if return_video_sample_fps:
434
+ return video, sample_fps, timestamps
435
+ return video
436
+
437
+ else:
438
+ assert isinstance(ele["video"], (list, tuple))
439
+ process_info = ele.copy()
440
+ process_info.pop("type", None)
441
+ process_info.pop("video", None)
442
+ images = [
443
+ fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
444
+ for video_element in ele["video"]
445
+ ]
446
+ nframes = adjust_by_factor(len(images), FRAME_FACTOR, method='ceil')
447
+ if len(images) < nframes:
448
+ images.extend([images[-1]] * (nframes - len(images)))
449
+
450
+ timestamps = [-1 for i in range(nframes)] # not sure about this
451
+ if return_video_sample_fps:
452
+ return images, process_info.pop("fps", 2.0), timestamps
453
+ return images
454
+
455
+ class Eagle3_VLProcessorKwargs(ProcessingKwargs, total=False):
456
+ # see processing_utils.ProcessingKwargs documentation for usage.
457
+ _defaults = {
458
+ "text_kwargs": {
459
+ "padding": False,
460
+ },
461
+ "images_kwargs": {},
462
+ "videos_kwargs": {},
463
+ }
464
+
465
+
466
+ class Eagle3_VLProcessor(ProcessorMixin):
467
+ r"""
468
+ Constructs a Eagle3_VL processor which wraps a Eagle3_VL video processor, Eagle3_VL image processor and a Eagle3_VL tokenizer into a single processor.
469
+ [`Eagle3_VLProcessor`] offers all the functionalities of [`Eagle3_VLVideoProcessor`], [`Eagle3_VLImageProcessor`] and [`Eagle3_VLTokenizer`]. See the
470
+ [`~Eagle3_VLVideoProcessor.__call__`], [`~Eagle3_VLProcessor.__call__`] and [`~Eagle3_VLProcessor.decode`] for more information.
471
+ Args:
472
+ image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
473
+ The image processor is a required input.
474
+ tokenizer ([`LlamaTokenizerFast`], *optional*):
475
+ The tokenizer is a required input.
476
+ num_image_tokens (`int`, *optional*):
477
+ Number of image tokens for one imagethat will be returned by vision tower.
478
+ vision_feature_select_strategy (`str`, *optional*):
479
+ The feature selection strategy used to select the vision feature from the vision backbone.
480
+ Shoudl be same as in model's config
481
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
482
+ in a chat into a tokenizable string.
483
+ image_token (`str`, *optional*, defaults to `"<image>"`):
484
+ Special token used to denote image location.
485
+ video_token (`str`, *optional*, defaults to `"<video>"`):
486
+ Special token used to denote video location.
487
+ """
488
+
489
+ attributes = ["image_processor", "tokenizer"]
490
+ valid_kwargs = [
491
+ "chat_template",
492
+ "num_image_tokens",
493
+ "vision_feature_select_strategy",
494
+ "image_token",
495
+ "video_token",
496
+ "images_kwargs",
497
+ "videos_kwargs",
498
+ "text_kwargs",
499
+ ]
500
+ image_processor_class = "AutoImageProcessor"
501
+ tokenizer_class = "AutoTokenizer"
502
+
503
+ def __init__(
504
+ self,
505
+ image_processor=None,
506
+ tokenizer=None,
507
+ vision_feature_select_strategy=None,
508
+ chat_template=None,
509
+ image_token='<IMG_CONTEXT>',
510
+ video_token='<IMG_CONTEXT>',
511
+ pixels_per_token=28*28,
512
+ image_placeholder='image',
513
+ video_placeholder='video',
514
+ image_start_token='<img>',
515
+ image_end_token='</img>',
516
+ **kwargs,
517
+ ):
518
+ self.vision_feature_select_strategy = vision_feature_select_strategy
519
+ self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
520
+ self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
521
+ self.image_token_id = (
522
+ tokenizer.image_token_id
523
+ if getattr(tokenizer, "image_token_id", None)
524
+ else tokenizer.convert_tokens_to_ids(self.image_token)
525
+ )
526
+ self.video_token_id = (
527
+ tokenizer.video_token_id
528
+ if getattr(tokenizer, "video_token_id", None)
529
+ else tokenizer.convert_tokens_to_ids(self.video_token)
530
+ )
531
+ self.image_placeholder = image_placeholder
532
+ self.video_placeholder = video_placeholder
533
+ self.pixels_per_token = pixels_per_token
534
+ self.image_start_token = image_start_token
535
+ self.image_end_token = image_end_token
536
+ if 'auto_map' in kwargs:
537
+ self.auto_map = kwargs['auto_map']
538
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
539
+
540
+
541
+ def replace_media_placeholder(self, text, image_list, video_list, timestamps_list, fps_list, **output_kwargs):
542
+
543
+ num_of_images_in_this_sample = 0
544
+ num_of_videos_in_this_sample = 0
545
+ # Regular expression pattern to match formats like <image-1> or <video-2>
546
+ pattern = re.compile(rf"<({self.image_placeholder}|{self.video_placeholder})-(\d+)>")
547
+ unified_frame_list = []
548
+
549
+ # Function to replace tags in a single text
550
+ def replace_in_text(text):
551
+ # repl callback function for each match replacement operation
552
+ def repl(match):
553
+ nonlocal unified_frame_list
554
+ nonlocal num_of_images_in_this_sample
555
+ nonlocal num_of_videos_in_this_sample
556
+ media_type = match.group(1) # 'image' or 'video'
557
+ idx_in_list = int(match.group(2)) - 1 # Convert to list index (0-based)
558
+ # Select the corresponding path based on media type
559
+ idx_mapper = {0: "first", 1: "second", 2: "third", 3: "fourth", 4: "fifth", 5: "sixth", 6: "seventh", 7: "eighth", 8: "ninth", 9: "tenth"}
560
+ if media_type == 'image':
561
+ image_inputs = self.image_processor(images=[image_list[idx_in_list]], videos=None, **output_kwargs["images_kwargs"])
562
+ image_height, image_width = image_inputs['image_sizes'][0]
563
+ assert image_height <= IMAGE_MAX_SIZE and image_width <= IMAGE_MAX_SIZE, f"image_height: {image_height}, image_width: {image_width}"
564
+ image_tokens = image_height * image_width // self.pixels_per_token
565
+ special_placeholder = f"<image {idx_in_list+1}>{self.image_start_token}{self.image_token * image_tokens}{self.image_end_token}"
566
+ unified_frame_list.append(image_inputs)
567
+ num_of_images_in_this_sample += 1
568
+
569
+ elif media_type == 'video':
570
+
571
+ video_inputs = self.image_processor(images=None, videos=video_list[idx_in_list], **output_kwargs["videos_kwargs"])
572
+ N, C, image_height, image_width = video_inputs['pixel_values'].shape
573
+ image_tokens = image_height * image_width // self.pixels_per_token
574
+
575
+ assert image_height <= IMAGE_MAX_SIZE and image_width <= IMAGE_MAX_SIZE, f"image_height: {image_height}, image_width: {image_width}"
576
+
577
+ if timestamps_list is not None and -1 not in timestamps_list:
578
+ frame_timestamps = timestamps_list[idx_in_list]
579
+ else:
580
+ frame_timestamps = None
581
+ sampled_fps = fps_list[idx_in_list] if fps_list is not None else None
582
+
583
+ num_of_tokens_list = [image_tokens] * N
584
+
585
+ if frame_timestamps is not None:
586
+ assert len(frame_timestamps) == len(num_of_tokens_list), f"The number of timestamps is not equal to the number of frames: {len(frame_timestamps)} != {len(num_of_tokens_list)}"
587
+ special_placeholder = [f"Frame {i+1} sample at {frame_timestamps[i]:.2f}s: {self.image_start_token}{self.image_token * num_of_tokens}{self.image_end_token}" for i, num_of_tokens in enumerate(num_of_tokens_list)]
588
+ else:
589
+ special_placeholder = [f"Frame {i+1}: {self.image_start_token}{self.image_token * num_of_tokens}{self.image_end_token}" for i, num_of_tokens in enumerate(num_of_tokens_list)]
590
+
591
+ if sampled_fps is not None:
592
+ special_placeholder = f"The {idx_mapper[idx_in_list]} video sampled with {sampled_fps:.2f} fps: " + "".join(special_placeholder)
593
+ else:
594
+ special_placeholder = f"The {idx_mapper[idx_in_list]} video: " + "".join(special_placeholder)
595
+ unified_frame_list.append(video_inputs)
596
+ num_of_videos_in_this_sample += 1
597
+ else:
598
+ raise ValueError(f'Unknown media type: {media_type}')
599
+ return special_placeholder
600
+ return pattern.sub(repl, text)
601
+ text = replace_in_text(text)
602
+ if len(unified_frame_list) > 0:
603
+ pixel_values = [frame['pixel_values'] for frame in unified_frame_list]
604
+ image_sizes = torch.cat([frame['image_sizes'] for frame in unified_frame_list], dim=0)
605
+ else:
606
+ pixel_values = []
607
+ image_sizes = []
608
+ return text, pixel_values, image_sizes, num_of_images_in_this_sample, num_of_videos_in_this_sample
609
+
610
+ def __call__(
611
+ self,
612
+ images: ImageInput = None,
613
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
614
+ audio=None,
615
+ videos: VideoInput = None,
616
+ **kwargs: Unpack[Eagle3_VLProcessorKwargs],
617
+ ) -> BatchFeature:
618
+ """
619
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
620
+ and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
621
+ the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
622
+ LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
623
+ of the above two methods for more information.
624
+ Args:
625
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
626
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
627
+ tensor. Both channels-first and channels-last formats are supported.
628
+ text (`str`, `List[str]`, `List[List[str]]`):
629
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
630
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
631
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
632
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
633
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
634
+ Returns:
635
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
636
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
637
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
638
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
639
+ `None`).
640
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
641
+ - **pixel_values_videos** -- Pixel values of a video input to be fed to a model. Returned when `videos` is not `None`.
642
+ - **image_sizes** -- Size of each image that will be used to unpad an image. Returned when `images` is not `None`.
643
+ """
644
+
645
+
646
+ output_kwargs = self._merge_kwargs(
647
+ Eagle3_VLProcessorKwargs,
648
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
649
+ **kwargs,
650
+ )
651
+
652
+ if isinstance(text, str):
653
+ text_list = [text]
654
+ elif not isinstance(text, list) and not isinstance(text[0], str):
655
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
656
+ elif isinstance(text, list) and isinstance(text[0], str):
657
+ text_list = text
658
+
659
+ if images is None: images = []
660
+ if videos is None: videos = []
661
+
662
+ pixel_values_list = []
663
+ image_sizes_list = []
664
+ new_sample_list = []
665
+ image_start_idx = 0
666
+ video_start_idx = 0
667
+ timestamps_batch = output_kwargs['videos_kwargs'].pop("timestamps", None)
668
+ fps_batch = output_kwargs['videos_kwargs'].pop("fps", None)
669
+ for sample in text_list:
670
+ timestamps_list = timestamps_batch[video_start_idx:] if timestamps_batch is not None else None
671
+ fps_list = fps_batch[video_start_idx:] if fps_batch is not None else None
672
+ sample, pixel_values, image_sizes, num_of_images_in_this_sample, num_of_videos_in_this_sample = self.replace_media_placeholder(sample, images[image_start_idx:], videos[video_start_idx:], timestamps_list, fps_list, **output_kwargs)
673
+ new_sample_list.append(sample)
674
+ pixel_values_list.extend(pixel_values)
675
+ image_sizes_list.extend(image_sizes)
676
+
677
+ image_start_idx += num_of_images_in_this_sample
678
+ video_start_idx += num_of_videos_in_this_sample
679
+
680
+ if len(pixel_values) > 0:
681
+ image_inputs = {
682
+ 'pixel_values':pixel_values_list,
683
+ 'image_sizes': torch.stack(image_sizes_list, dim=0)
684
+ }
685
+ else:
686
+ image_inputs = {}
687
+ video_inputs = {}
688
+ text_inputs = self.tokenizer(new_sample_list, **output_kwargs["text_kwargs"])
689
+ return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
690
+
691
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
692
+ def batch_decode(self, *args, **kwargs):
693
+ """
694
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
695
+ refer to the docstring of this method for more information.
696
+ """
697
+ return self.tokenizer.batch_decode(*args, **kwargs)
698
+
699
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
700
+ def decode(self, *args, **kwargs):
701
+ """
702
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
703
+ the docstring of this method for more information.
704
+ """
705
+ return self.tokenizer.decode(*args, **kwargs)
706
+
707
+ @property
708
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
709
+ def model_input_names(self):
710
+ tokenizer_input_names = self.tokenizer.model_input_names
711
+ image_processor_input_names = self.image_processor.model_input_names
712
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
713
+
714
+ # override to save video-config in a separate config file
715
+ def save_pretrained(self, save_directory, **kwargs):
716
+ if os.path.isfile(save_directory):
717
+ raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
718
+ os.makedirs(save_directory, exist_ok=True)
719
+
720
+ outputs = super().save_pretrained(save_directory, **kwargs)
721
+ return outputs
722
+
723
+ # override to load video-config from a separate config file
724
+ @classmethod
725
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
726
+ processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
727
+
728
+ # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
729
+ if isinstance(processor, tuple):
730
+ processor = processor[0]
731
+ return processor
732
+
733
+ # Copy from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
734
+ def process_vision_info(
735
+ self,
736
+ conversations: list[dict] | list[list[dict]],
737
+ return_video_kwargs: bool = False,
738
+ ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
739
+
740
+ vision_infos = self.extract_vision_info(conversations)
741
+ ## Read images or videos
742
+ image_inputs = []
743
+ video_inputs = []
744
+ video_sample_fps_list = []
745
+ video_timestamps_list = []
746
+ for vision_info in vision_infos:
747
+ if "image" in vision_info or "image_url" in vision_info:
748
+ image_inputs.append(fetch_image(vision_info))
749
+ elif "video" in vision_info:
750
+ video_input, video_sample_fps, video_timestamps = fetch_video(vision_info, return_video_sample_fps=True)
751
+ video_sample_fps_list.append(video_sample_fps)
752
+ video_inputs.append(video_input)
753
+ video_timestamps_list.append(video_timestamps)
754
+ else:
755
+ raise ValueError("image, image_url or video should in content.")
756
+ if len(image_inputs) == 0:
757
+ image_inputs = None
758
+ if len(video_inputs) == 0:
759
+ video_inputs = None
760
+ if return_video_kwargs:
761
+ return image_inputs, video_inputs, {'fps': video_sample_fps_list, 'timestamps': video_timestamps_list}
762
+ return image_inputs, video_inputs
763
+
764
+ def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]:
765
+ vision_infos = []
766
+ if isinstance(conversations[0], dict):
767
+ conversations = [conversations]
768
+ for conversation in conversations:
769
+ for message in conversation:
770
+ if isinstance(message["content"], list):
771
+ for ele in message["content"]:
772
+ if (
773
+ "image" in ele
774
+ or "image_url" in ele
775
+ or "video" in ele
776
+ or ele["type"] in ("image", "image_url", "video")
777
+ ):
778
+ vision_infos.append(ele)
779
+ return vision_infos
780
+
781
+ def py_apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
782
+ """
783
+ Renders a chat conversation using a custom template with verification of tokens.
784
+ The purpose is to check for the existence of tokens like "<image-1>" or "<video-1>"
785
+ in the message text and skip adding them if they already exist.
786
+ Args:
787
+ messages (list): A list of message dictionaries. Each message should contain:
788
+ - 'role': The role of the speaker (e.g., 'system', 'user', 'assistant').
789
+ - 'content': Either a string or a list of content blocks. In the list each block may contain:
790
+ * 'type': The type of content, such as 'image' or 'video'.
791
+ * 'text': The actual text if present.
792
+ * Other keys such as 'image', 'image_url', or 'video'.
793
+ add_generation_prompt (bool): If True, appends "<|im_start|>assistant" at the end of the rendered string.
794
+ tokenize (bool): If True, tokenize the rendered string.
795
+ Returns:
796
+ str: The final rendered chat string according to the specified template.
797
+ """
798
+ assert tokenize == False, "tokenize is not supported yet"
799
+ result = ""
800
+ image_count = 0
801
+ video_count = 0
802
+
803
+ message_text = ""
804
+ for idx, message in enumerate(messages):
805
+ if message.get('role') != 'user': continue
806
+ # If content is a string, simply output it.
807
+ content = message.get('content')
808
+ if isinstance(content, str):
809
+ message_text += content
810
+ elif isinstance(content, list):
811
+ # Process each content item.
812
+ for item in content:
813
+ # If the block is a dictionary and contains text, add it to message_text.
814
+ if isinstance(item, dict) and "text" in item:
815
+ message_text += item["text"]
816
+ # If an item is already a string in the list, add it directly.
817
+ elif isinstance(item, str):
818
+ message_text += item
819
+
820
+ for idx, message in enumerate(messages):
821
+ # If the first message is not from the system, prepend a default system message.
822
+ if idx == 0 and message.get('role') != 'system':
823
+ result += "<|im_start|>system\n"
824
+ result += "You are a helpful assistant.\n"
825
+ result += "<|im_end|>\n"
826
+
827
+ # Start the current message block with its role.
828
+ result += f"<|im_start|>{message.get('role', '')}\n"
829
+ content = message.get('content')
830
+
831
+ # If content is a string, simply output it.
832
+ if isinstance(content, str):
833
+ result += content
834
+ result += "<|im_end|>\n"
835
+ else:
836
+ # Process each content item.
837
+ for item in content:
838
+ # Check if the item is an image (explicitly by type or by key presence).
839
+ if (isinstance(item, dict) and (item.get('type') == 'image' or 'image' in item or 'image_url' in item)):
840
+ image_count += 1
841
+ candidate_token = f"<image-{image_count}>"
842
+ # Only add the token if it is not already present in the collected text.
843
+ if candidate_token not in message_text:
844
+ result += candidate_token
845
+ # Check if the item is a video.
846
+ elif (isinstance(item, dict) and (item.get('type') == 'video' or 'video' in item)):
847
+ video_count += 1
848
+ candidate_token = f"<video-{video_count}>"
849
+ # Only add the token if it is not already present.
850
+ if candidate_token not in message_text:
851
+ result += candidate_token
852
+ # If the item contains text, add it.
853
+ elif isinstance(item, dict) and 'text' in item:
854
+ result += item['text']
855
+ # If the item is a string (and not handled already), add it.
856
+ elif isinstance(item, str):
857
+ result += item
858
+ result += "<|im_end|>\n"
859
+
860
+ # Optionally add assistant generation prompt at the end.
861
+ if add_generation_prompt:
862
+ result += "<|im_start|>assistant\n"
863
+
864
+ return result
865
+
866
+
867
+ @classmethod
868
+ def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
869
+ """
870
+ Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
871
+ Args:
872
+ processor_dict (`Dict[str, Any]`):
873
+ Dictionary that will be used to instantiate the processor object. Such a dictionary can be
874
+ retrieved from a pretrained checkpoint by leveraging the
875
+ [`~processing_utils.ProcessingMixin.to_dict`] method.
876
+ kwargs (`Dict[str, Any]`):
877
+ Additional parameters from which to initialize the processor object.
878
+ Returns:
879
+ [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
880
+ parameters.
881
+ """
882
+ processor_dict = processor_dict.copy()
883
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
884
+
885
+ # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
886
+ # If we don't pop, some specific kwargs will raise a warning
887
+ if "processor_class" in processor_dict:
888
+ del processor_dict["processor_class"]
889
+
890
+ #if "auto_map" in processor_dict:
891
+ # del processor_dict["auto_map"]
892
+
893
+ unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs)
894
+ processor = cls(*args, **processor_dict)
895
+
896
+ # Update processor with kwargs if needed
897
+ for key in set(kwargs.keys()):
898
+ if hasattr(processor, key):
899
+ setattr(processor, key, kwargs.pop(key))
900
+
901
+ if isinstance(unused_kwargs, dict):
902
+ kwargs.update(unused_kwargs)
903
+ elif isinstance(unused_kwargs, (list, tuple, set)):
904
+ for key in unused_kwargs:
905
+ if isinstance(key, str):
906
+ kwargs.setdefault(key, processor_dict.get(key, None))
907
+ logger.info(f"Processor {processor}")
908
+ if return_unused_kwargs:
909
+ return processor, kwargs
910
+ else:
911
+ return processor
912
+
913
+
914
+ __all__ = ["Eagle3_VLProcessor"]
processor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
4
+ },
5
+ "image_end_token": "</img>",
6
+ "image_placeholder": "image",
7
+ "image_start_token": "<img>",
8
+ "image_token": "<IMG_CONTEXT>",
9
+ "pixels_per_token": 784,
10
+ "processor_class": "Eagle3_VLProcessor",
11
+ "video_placeholder": "video",
12
+ "video_token": "<IMG_CONTEXT>",
13
+ "vision_feature_select_strategy": null
14
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<box>",
20
+ "</box>",
21
+ "<quad>",
22
+ "</quad>",
23
+ "<ref>",
24
+ "</ref>",
25
+ "<interval>",
26
+ "</interval>"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6c716b741f9fab52cd98c4f246aa3594e16e3758216825f5e58e7d42be60242
3
+ size 11425759
tokenizer_config.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<quad>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</quad>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<ref>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</ref>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<interval>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</interval>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<abs_vis_token_pad>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "<abs_vis_token>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "151682": {
319
+ "content": "</abs_vis_token>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "151683": {
327
+ "content": "<observation>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "151684": {
335
+ "content": "</observation>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ }
342
+ },
343
+ "additional_special_tokens": [
344
+ "<|im_start|>",
345
+ "<|im_end|>",
346
+ "<|object_ref_start|>",
347
+ "<|object_ref_end|>",
348
+ "<|box_start|>",
349
+ "<|box_end|>",
350
+ "<|quad_start|>",
351
+ "<|quad_end|>",
352
+ "<|vision_start|>",
353
+ "<|vision_end|>",
354
+ "<|vision_pad|>",
355
+ "<|image_pad|>",
356
+ "<|video_pad|>",
357
+ "<IMG_CONTEXT>",
358
+ "<img>",
359
+ "</img>",
360
+ "<box>",
361
+ "</box>",
362
+ "<quad>",
363
+ "</quad>",
364
+ "<ref>",
365
+ "</ref>",
366
+ "<interval>",
367
+ "</interval>"
368
+ ],
369
+ "auto_map": {
370
+ "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
371
+ },
372
+ "bos_token": null,
373
+ "clean_up_tokenization_spaces": false,
374
+ "eos_token": "<|im_end|>",
375
+ "errors": "replace",
376
+ "extra_special_tokens": {},
377
+ "fix_mistral_regex": true,
378
+ "model_max_length": 32768,
379
+ "pad_token": "<|endoftext|>",
380
+ "processor_class": "Eagle3_VLProcessor",
381
+ "split_special_tokens": false,
382
+ "tokenizer_class": "Qwen2Tokenizer",
383
+ "unk_token": null
384
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a005629d1affea408ebb4f7e9da2b2face9d845ad173ac7f316bef2383eb0f
3
+ size 8145
vocab.json ADDED
The diff for this file is too large to render. See raw diff