yiyexy committed on
Commit
9d1705f
·
verified ·
1 Parent(s): ee88bb4

Add codec video backend & docs (processing_llava_onevision2.py)

Browse files
Files changed (1) hide show
  1. processing_llava_onevision2.py +114 -0
processing_llava_onevision2.py CHANGED
@@ -79,6 +79,7 @@ class LlavaOnevision2Processor:
79
  tokenizer=None,
80
  video_processor=None,
81
  chat_template: Optional[str] = None,
 
82
  ):
83
  self.image_processor = image_processor
84
  self.tokenizer = tokenizer
@@ -94,6 +95,9 @@ class LlavaOnevision2Processor:
94
  getattr(image_processor, "merge_size", 2) if image_processor is not None else 2
95
  )
96
 
 
 
 
97
  # ------------------------------------------------------------------ utils
98
 
99
  @classmethod
@@ -114,6 +118,7 @@ class LlavaOnevision2Processor:
114
  kwargs.pop("_from_auto", None)
115
  kwargs.pop("trust_remote_code", None)
116
  kwargs.pop("code_revision", None)
 
117
 
118
  # Use the SLOW Qwen2VLImageProcessor: the Fast variant has small
119
  # normalization rounding differences that change pixel_values bit-for-bit.
@@ -141,10 +146,34 @@ class LlavaOnevision2Processor:
141
  patch_size=getattr(image_processor, "patch_size", 14),
142
  spatial_merge_size=getattr(image_processor, "merge_size", 2),
143
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  return cls(
145
  image_processor=image_processor,
146
  tokenizer=tokenizer,
147
  video_processor=video_processor,
 
148
  )
149
 
150
  # ------------------------------------------------------------- chat helpers
@@ -167,6 +196,14 @@ class LlavaOnevision2Processor:
167
  num_frames: Optional[int] = None,
168
  max_frames: Optional[int] = None,
169
  target_fps: Optional[float] = None,
 
 
 
 
 
 
 
 
170
  **kwargs,
171
  ):
172
  """Process an aligned (text, images, videos) batch.
@@ -200,6 +237,83 @@ class LlavaOnevision2Processor:
200
 
201
  out: dict = {}
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  # ---------------- VIDEO PATH ----------------
204
  # Process videos first so we can rewrite their placeholders into the
205
  # text before tokenization.
 
79
  tokenizer=None,
80
  video_processor=None,
81
  chat_template: Optional[str] = None,
82
+ codec_config: Optional[dict] = None,
83
  ):
84
  self.image_processor = image_processor
85
  self.tokenizer = tokenizer
 
95
  getattr(image_processor, "merge_size", 2) if image_processor is not None else 2
96
  )
97
 
98
+ # Codec config defaults (overridden per-call via ``codec_config=``).
99
+ self._codec_config_defaults: dict = dict(codec_config or {})
100
+
101
  # ------------------------------------------------------------------ utils
102
 
103
  @classmethod
 
118
  kwargs.pop("_from_auto", None)
119
  kwargs.pop("trust_remote_code", None)
120
  kwargs.pop("code_revision", None)
121
+ codec_config_override = kwargs.pop("codec_config", None)
122
 
123
  # Use the SLOW Qwen2VLImageProcessor: the Fast variant has small
124
  # normalization rounding differences that change pixel_values bit-for-bit.
 
146
  patch_size=getattr(image_processor, "patch_size", 14),
147
  spatial_merge_size=getattr(image_processor, "merge_size", 2),
148
  )
149
+
150
+ # Codec defaults are read from preprocessor_config.json's "codec" field.
151
+ # We load the JSON directly because Qwen2VLImageProcessor.from_pretrained
152
+ # may not preserve unknown top-level keys as attributes.
153
+ if codec_config_override is not None:
154
+ codec_defaults = codec_config_override
155
+ else:
156
+ codec_defaults = {}
157
+ try:
158
+ import json as _json
159
+ import os as _os
160
+ # Try local file first (downloaded snapshot), then HF Hub.
161
+ cfg_path = _os.path.join(pretrained_model_name_or_path, "preprocessor_config.json")
162
+ if _os.path.isfile(cfg_path):
163
+ with open(cfg_path, "r", encoding="utf-8") as _f:
164
+ codec_defaults = _json.load(_f).get("codec", {}) or {}
165
+ else:
166
+ from huggingface_hub import hf_hub_download
167
+ cfg_path = hf_hub_download(pretrained_model_name_or_path, "preprocessor_config.json")
168
+ with open(cfg_path, "r", encoding="utf-8") as _f:
169
+ codec_defaults = _json.load(_f).get("codec", {}) or {}
170
+ except Exception:
171
+ codec_defaults = {}
172
  return cls(
173
  image_processor=image_processor,
174
  tokenizer=tokenizer,
175
  video_processor=video_processor,
176
+ codec_config=codec_defaults,
177
  )
178
 
179
  # ------------------------------------------------------------- chat helpers
 
196
  num_frames: Optional[int] = None,
197
  max_frames: Optional[int] = None,
198
  target_fps: Optional[float] = None,
199
+ # Codec video backend (in-processor codec preprocessing). When
200
+ # ``video_backend="codec"`` and ``videos`` is set, the codec pipeline
201
+ # (cv-preinfer) replaces the frame-sampling VideoProcessor. The codec
202
+ # canvas pixel budget is taken from ``max_pixels`` so the user only
203
+ # configures one pixel knob.
204
+ video_backend: str = "frames",
205
+ max_pixels: Optional[int] = None,
206
+ codec_config: Optional[dict] = None,
207
  **kwargs,
208
  ):
209
  """Process an aligned (text, images, videos) batch.
 
237
 
238
  out: dict = {}
239
 
240
+ # ---------------- CODEC VIDEO BACKEND ----------------
241
+ # Codec path: replaces the frame-sampling VideoProcessor entirely.
242
+ # Each video -> N canvases + src_patch_position; we rewrite the
243
+ # <|vision_start|>...<|vision_end|> span in `text` based on the codec
244
+ # patch_positions (one canvas worth of <|image_pad|>s per timestamp).
245
+ if videos is not None and str(video_backend).lower() == "codec":
246
+ try:
247
+ from .codec_video_processing_llava_onevision2 import (
248
+ CodecConfig, process_codec_video, drop_padding_canvases,
249
+ codec_positions_for_processor, rewrite_text_with_codec_positions,
250
+ codec_image_processor_outputs,
251
+ )
252
+ except ImportError:
253
+ from codec_video_processing_llava_onevision2 import (
254
+ CodecConfig, process_codec_video, drop_padding_canvases,
255
+ codec_positions_for_processor, rewrite_text_with_codec_positions,
256
+ codec_image_processor_outputs,
257
+ )
258
+
259
+ # Normalise to list[video_url].
260
+ if isinstance(videos, str):
261
+ videos_list = [videos]
262
+ else:
263
+ videos_list = list(videos)
264
+
265
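+ # Build effective codec config; precedence (lowest to highest): CodecConfig's own defaults < processor-level defaults (``self._codec_config_defaults``) < per-call ``codec_config``.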
266
+ cfg_kwargs = dict(self._codec_config_defaults)
267
+ if codec_config:
268
+ cfg_kwargs.update(codec_config)
269
+ # Unify pixel budget with image_processor.
270
+ effective_max_pixels = int(
271
+ max_pixels
272
+ if max_pixels is not None
273
+ else cfg_kwargs.get("max_pixels", getattr(self.image_processor, "max_pixels", 150000))
274
+ )
275
+ cfg_kwargs["max_pixels"] = effective_max_pixels
276
+ cfg = CodecConfig(**cfg_kwargs)
277
+
278
+ all_pixel_values, all_grid_thw, all_patch_positions = [], [], []
279
+ rewritten_texts = list(text)
280
+ if len(rewritten_texts) != len(videos_list):
281
+ if len(rewritten_texts) == 1 and len(videos_list) >= 1:
282
+ rewritten_texts = rewritten_texts * len(videos_list)
283
+ else:
284
+ raise ValueError(
285
+ f"codec video backend: got {len(rewritten_texts)} texts but {len(videos_list)} videos"
286
+ )
287
+
288
+ for idx, video_url in enumerate(videos_list):
289
+ payload = process_codec_video(video_url, cfg)
290
+ imgs, src_positions, _ = drop_padding_canvases(
291
+ payload["images"], payload["src_positions"]
292
+ )
293
+ if not imgs:
294
+ raise RuntimeError(f"codec produced no usable canvases for {video_url}")
295
+ image_data = codec_image_processor_outputs(
296
+ self.image_processor, imgs, max_pixels=effective_max_pixels
297
+ )
298
+ image_grid_thw = image_data["image_grid_thw"]
299
+ patch_positions = codec_positions_for_processor(
300
+ src_positions, image_grid_thw, device=image_grid_thw.device,
301
+ )
302
+ rewritten_texts[idx] = rewrite_text_with_codec_positions(
303
+ rewritten_texts[idx], patch_positions,
304
+ fps=float(payload["fps"]), decimals=1,
305
+ )
306
+ all_pixel_values.append(image_data["pixel_values"])
307
+ all_grid_thw.append(image_grid_thw)
308
+ all_patch_positions.append(patch_positions)
309
+
310
+ out["pixel_values"] = torch.cat(all_pixel_values, dim=0)
311
+ out["image_grid_thw"] = torch.cat(all_grid_thw, dim=0)
312
+ out["patch_positions"] = torch.cat(all_patch_positions, dim=0)
313
+ text = rewritten_texts
314
+ # Codec branch handled the video. Suppress the frame-sampling block below.
315
+ videos = None
316
+
317
  # ---------------- VIDEO PATH ----------------
318
  # Process videos first so we can rewrite their placeholders into the
319
  # text before tokenization.