TYTTYTTYT committed on
Commit
693246c
·
verified ·
1 Parent(s): 791aaf4

update video processor params to the qwen3vl paper version

Browse files
video_preprocessor_config.json CHANGED
@@ -14,7 +14,7 @@
14
  "do_resize": true,
15
  "do_sample_frames": true,
16
  "focus_size": 2,
17
- "fps": 1,
18
  "image_mean": [
19
  0.5,
20
  0.5,
@@ -26,18 +26,19 @@
26
  0.5
27
  ],
28
  "input_data_format": null,
29
- "max_frames": 3600,
30
  "merge_size": 2,
31
  "min_frames": 4,
32
  "num_frames": null,
33
  "pad_size": null,
34
  "patch_size": 16,
35
  "processor_class": "ZFQwen3VLProcessor",
 
36
  "resample": 3,
37
  "rescale_factor": 0.00392156862745098,
38
  "return_metadata": false,
39
  "size": {
40
- "longest_edge": 235929600,
41
  "shortest_edge": 4096
42
  },
43
  "temporal_patch_size": 2,
 
14
  "do_resize": true,
15
  "do_sample_frames": true,
16
  "focus_size": 2,
17
+ "fps": 2,
18
  "image_mean": [
19
  0.5,
20
  0.5,
 
26
  0.5
27
  ],
28
  "input_data_format": null,
29
+ "max_frames": 2048,
30
  "merge_size": 2,
31
  "min_frames": 4,
32
  "num_frames": null,
33
  "pad_size": null,
34
  "patch_size": 16,
35
  "processor_class": "ZFQwen3VLProcessor",
36
+ "processor_device": "cpu",
37
  "resample": 3,
38
  "rescale_factor": 0.00392156862745098,
39
  "return_metadata": false,
40
  "size": {
41
+ "longest_edge": 458752000,
42
  "shortest_edge": 4096
43
  },
44
  "temporal_patch_size": 2,
video_processing_qwen3_vl.py CHANGED
@@ -1,8 +1,9 @@
1
  import math
2
- from typing import Optional, Union
3
 
4
  import numpy as np
5
  import torch
 
6
 
7
  from transformers.feature_extraction_utils import BatchFeature
8
  from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
@@ -11,8 +12,8 @@ from transformers.utils.generic import TensorType
11
  from transformers.utils.doc import add_start_docstrings
12
  from transformers.utils import logging
13
  from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
14
- from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
15
-
16
 
17
  logger = logging.get_logger(__name__)
18
 
@@ -57,6 +58,7 @@ class Qwen3VLVideoProcessorInitKwargs(VideosKwargs):
57
  focus_size: Optional[int]
58
  min_frames: Optional[int]
59
  max_frames: Optional[int]
 
60
 
61
 
62
  @add_start_docstrings(
@@ -88,6 +90,7 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
88
  min_frames = 4
89
  max_frames = 768
90
  do_sample_frames = True
 
91
  valid_kwargs = Qwen3VLVideoProcessorInitKwargs
92
  model_input_names = ["pixel_values_videos", "video_grid_thw"]
93
 
@@ -183,6 +186,9 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
183
  grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
184
  resized_videos_grouped = {}
185
 
 
 
 
186
  for shape, stacked_videos in grouped_videos.items():
187
  B, T, C, H, W = stacked_videos.shape
188
  num_frames, height, width = T, H, W
@@ -262,5 +268,93 @@ class ZFQwen3VLVideoProcessor(BaseVideoProcessor):
262
 
263
  return BatchFeature(data=data, tensor_type=return_tensors)
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
  __all__ = ["ZFQwen3VLVideoProcessor"]
 
1
  import math
2
+ from typing import Optional, Union, Iterable
3
 
4
  import numpy as np
5
  import torch
6
+ from torchvision.transforms.v2 import functional as F
7
 
8
  from transformers.feature_extraction_utils import BatchFeature
9
  from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 
12
  from transformers.utils.doc import add_start_docstrings
13
  from transformers.utils import logging
14
  from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
15
+ from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos, load_video, VideoInput
16
+ from transformers.image_transforms import to_channel_dimension_format
17
 
18
  logger = logging.get_logger(__name__)
19
 
 
58
  focus_size: Optional[int]
59
  min_frames: Optional[int]
60
  max_frames: Optional[int]
61
+ processor_device: Optional[str]
62
 
63
 
64
  @add_start_docstrings(
 
90
  min_frames = 4
91
  max_frames = 768
92
  do_sample_frames = True
93
+ processor_device: str = "cpu"
94
  valid_kwargs = Qwen3VLVideoProcessorInitKwargs
95
  model_input_names = ["pixel_values_videos", "video_grid_thw"]
96
 
 
186
  grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
187
  resized_videos_grouped = {}
188
 
189
+ for vid in videos:
190
+ print(f'vid type: {type(vid)}, vid shape: {vid.shape}')
191
+
192
  for shape, stacked_videos in grouped_videos.items():
193
  B, T, C, H, W = stacked_videos.shape
194
  num_frames, height, width = T, H, W
 
268
 
269
  return BatchFeature(data=data, tensor_type=return_tensors)
270
 
271
def fetch_videos(  # type: ignore
    self,
    video_url_or_urls: Union[str, list[str], list[list[str]]],
    sample_indices_fn=None,
):
    """
    Convert a single url or a (possibly nested) list of urls into decoded videos.

    If a single url is passed, returns a `(video, metadata)` pair. If a list is
    passed, recurses over the entries and transposes the resulting pairs into
    two parallel tuples: `(videos, metadatas)`.

    Args:
        video_url_or_urls: One video url/path, or a (nested) list of them.
        sample_indices_fn: Optional callable forwarded to `load_video` to pick
            which frame indices to decode.

    Returns:
        `(video, metadata)` for a single url, or `(videos, metadatas)` tuples
        for a list input.
    """
    if isinstance(video_url_or_urls, list):
        # zip(*...) transposes [(video, meta), ...] into (videos, metadatas).
        return list(zip(*[self.fetch_videos(x, sample_indices_fn=sample_indices_fn) for x in video_url_or_urls]))

    video, metadata = load_video(
        video_url_or_urls,  # type: ignore
        backend="torchcodec",
        sample_indices_fn=sample_indices_fn,
        device=self.processor_device,
    )  # type: ignore
    # Leftover debug print replaced with lazy %-style debug logging so the
    # message is only formatted when debug logging is enabled.
    logger.debug(
        "Loaded video shape: %s, dtype: %s, device: %s",
        video.shape, video.dtype, video.device,
    )
    return video, metadata
293
+
294
def normalize(
    self,
    image: "torch.Tensor",
    mean: Union[float, Iterable[float]],
    std: Union[float, Iterable[float]],
    **kwargs,
) -> "torch.Tensor":
    """
    Channel-wise normalization: `image = (image - mean) / std`.

    Args:
        image (`torch.Tensor`):
            The tensor to normalize.
        mean (`torch.Tensor`, `float` or `Iterable[float]`):
            Per-channel mean subtracted from the image.
        std (`torch.Tensor`, `float` or `Iterable[float]`):
            Per-channel standard deviation the image is divided by.

    Returns:
        `torch.Tensor`: The normalized tensor.
    """
    # Delegates to torchvision's functional normalize. `inplace=True` mutates
    # the input tensor to avoid allocating a second full-size buffer.
    normalized = F.normalize(image, mean, std, inplace=True)  # type: ignore
    return normalized
316
+
317
def rescale(
    self,
    image: "torch.Tensor",
    scale: float,
    **kwargs,
) -> "torch.Tensor":
    """
    Scale pixel values by a constant factor: `image = image * scale`.

    Args:
        image (`torch.Tensor`):
            The tensor whose values are rescaled.
        scale (`float`):
            Multiplicative factor applied to every element.

    Returns:
        `torch.Tensor`: The rescaled tensor (same object as `image`).
    """
    # In-place multiply avoids allocating a second full-size tensor.
    scaled = image.mul_(scale)
    return scaled
336
+
337
+ def _prepare_input_videos(
338
+ self,
339
+ videos: VideoInput,
340
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
341
+ device: Optional[str] = None,
342
+ ) -> list["torch.Tensor"]:
343
+ """
344
+ Prepare the input videos for processing.
345
+ """
346
+ processed_videos = []
347
+ for video in videos:
348
+ # `make_batched_videos` always returns a 4D array per video
349
+ if isinstance(video, np.ndarray):
350
+ video = to_channel_dimension_format(video, ChannelDimension.FIRST, input_data_format)
351
+ # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
352
+ video = torch.from_numpy(video).contiguous()
353
+
354
+ if device is not None:
355
+ raise ValueError("The `device` argument is not supported. Please use `processor_device` instead.")
356
+
357
+ processed_videos.append(video)
358
+ return processed_videos
359
 
360
  __all__ = ["ZFQwen3VLVideoProcessor"]