Update processing_penguinvl.py

#4
by Cyril666 - opened
Files changed (1)
  1. processing_penguinvl.py +96 -114
processing_penguinvl.py CHANGED
@@ -204,120 +204,110 @@ def floor_by_factor(number: int, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 def smart_resize(
-    height: int, width: int,
-    factor: int = 14,
-    min_pixels: int = 0,
-    max_pixels: int = 16384):
+    height: int,
+    width: int,
+    factor: int = 14,
+    min_pixels: int = 0,
+    max_pixels: int = 16384,
+):
     """
-    Rescales the image so that the following conditions are met:
-
-    1. Both dimensions (height and width) are divisible by 'factor'.
-
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-    3. The aspect ratio of the image is maintained as closely as possible.
+    Compute target (height, width) such that:
+    - Both dimensions are divisible by factor.
+    - Total pixels lie in [min_pixels, max_pixels].
+    - Aspect ratio is preserved as closely as possible.
     """
-
-    if max(height, width) / min(height, width) > 200:
+    def round_by_factor(number: int, factor: int) -> int:
+        """Returns the closest integer to 'number' that is divisible by 'factor'."""
+        return round(number / factor) * factor
+    def ceil_by_factor(number: int, factor: int) -> int:
+        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+        return math.ceil(number / factor) * factor
+    def floor_by_factor(number: int, factor: int) -> int:
+        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+        return math.floor(number / factor) * factor
+
+    max_ratio = 200
+    if max(height, width) / min(height, width) > max_ratio:
         raise ValueError(
-            f"absolute aspect ratio must be smaller than {200}, got {max(height, width) / min(height, width)}"
+            f"Aspect ratio must be < {max_ratio}, got {max(height, width) / min(height, width)}"
         )
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    return max(h_bar, factor), max(w_bar, factor)
+    h = max(factor, round_by_factor(height, factor))
+    w = max(factor, round_by_factor(width, factor))
+    if h * w > max_pixels:
+        scale = math.sqrt((height * width) / max_pixels)
+        h = floor_by_factor(height / scale, factor)
+        w = floor_by_factor(width / scale, factor)
+    elif h * w < min_pixels:
+        scale = math.sqrt(min_pixels / (height * width))
+        h = ceil_by_factor(height * scale, factor)
+        w = ceil_by_factor(width * scale, factor)
+    return max(h, factor), max(w, factor)
 
-def get_frame_sim(frame1, frame2,
-                  patch_size: int=14,
-                  threshold: float = 0.7,
-                  epsilon: float=1e-8):
-    assert frame1.dim() == 3 and frame2.dim() == 3, "Input must be a 3D tensor [C, H, W]"
-
-    # Convert the PyTorch tensors to OpenCV-format numpy arrays
-    def to_numpy_cvt(tensor):
-        # Make sure the tensor is on the CPU and convert it to HWC layout
-        tensor = tensor.cpu().permute(1, 2, 0).numpy()
-        if tensor.dtype == np.float32 or tensor.dtype == np.float64:
-            tensor = (tensor).astype(np.uint8)
-        # Convert to the HSV color space
-        return cv2.cvtColor(tensor, cv2.COLOR_RGB2HSV)
-
-    # Convert the color space
-    frame1_hsv = to_numpy_cvt(frame1)
-    frame2_hsv = to_numpy_cvt(frame2)
-
-    # Convert the HSV images back to PyTorch tensors
-    frame1_tensor = torch.from_numpy(frame1_hsv).permute(2, 0, 1).to(frame1.device).float()
-    frame2_tensor = torch.from_numpy(frame2_hsv).permute(2, 0, 1).to(frame2.device).float()
-
-    # Split into patches
-    patch1 = rearrange(
-        frame1_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
-    patch2 = rearrange(
-        frame2_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
+# Adapted from Keye-VL: https://github.com/Kwai-Keye/Keye
+def get_frame_sim(
+    frame1: torch.Tensor,
+    frame2: torch.Tensor,
+    patch_size: int = 14,
+    threshold: float = 0.7,
+    epsilon: float = 1e-8,
+) -> float:
+    """Cosine similarity between two frames in HSV, averaged over patches. Returns mean similarity in [0, 1]."""
+    assert frame1.dim() == 3 and frame2.dim() == 3, "Frames must be 3D tensors [C, H, W]"
+
+    def to_hsv_tensor(tensor: torch.Tensor) -> torch.Tensor:
+        arr = tensor.cpu().permute(1, 2, 0).numpy()
+        if arr.dtype in (np.float32, np.float64):
+            arr = arr.astype(np.uint8)
+        hsv = cv2.cvtColor(arr, cv2.COLOR_RGB2HSV)
+        return torch.from_numpy(hsv).permute(2, 0, 1).to(tensor.device).float()
+
+    f1 = to_hsv_tensor(frame1)
+    f2 = to_hsv_tensor(frame2)
+    patch1 = rearrange(f1, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
+    patch2 = rearrange(f2, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
 
     norm1 = torch.norm(patch1, p=2, dim=-1, keepdim=True) + epsilon
     norm2 = torch.norm(patch2, p=2, dim=-1, keepdim=True) + epsilon
-
-    normalized1 = patch1 / norm1
-    normalized2 = patch2 / norm2
-    cos_sim = (normalized1 * normalized2).sum(dim=-1)
-
-    zero_vector_mask = (norm1.squeeze() < 0.01) & (norm2.squeeze() < 0.01)  # all-black frames
-
-    similar = torch.ones_like(cos_sim)  # default: everything is similar
-
-    non_zero_mask = ~zero_vector_mask
-    similar[non_zero_mask] = (cos_sim[non_zero_mask] > threshold).float()
-
-    return similar[non_zero_mask].float().mean().item()
-
-def extract_slow_fast_frames(frames, threshold = 0.95):
-    def _extract_slow_indices(frames):
-        assert frames.dim() == 4, "Input must be a 4D tensor [N, C, H, W]"
-
-        # The first frame is always Slow
-        slow_indices = [0]
-        # Debug marker: check whether this is the same video as the one raising the image[0] error
-        last_key_frame = frames[0]
-        for i in range(1, frames.size(0)):
-            current_frame = frames[i]
-            sim = get_frame_sim(last_key_frame, current_frame)
-
-            if sim < threshold:
-                slow_indices.append(i)
-                last_key_frame = current_frame  # update the key frame
-
-        return slow_indices
-
-    _, _, height, width = frames.shape
-    resized_height, resized_width = smart_resize(
-        height,
-        width,
-        factor=14,
-        min_pixels=10 * 14 * 14,
-        max_pixels=10240 * 14 * 14,
-    )
-
-    resized_frames = nn.functional.interpolate(
-        frames,
-        [resized_height, resized_width],
-        mode="bilinear",
-        antialias=True,
-    ).float()
-
-    slow_indices = _extract_slow_indices(resized_frames)
-    frame_types = torch.ones(size=(frames.size(0), ), dtype=torch.int32)
-    frame_types[slow_indices] = 0
-
-    return list(frame_types)
+    cos_sim = (patch1 / norm1 * patch2 / norm2).sum(dim=-1)
+
+    both_near_zero = (norm1.squeeze() < 0.01) & (norm2.squeeze() < 0.01)
+    similar = torch.ones_like(cos_sim)
+    similar[~both_near_zero] = (cos_sim[~both_near_zero] > threshold).float()
+    return similar[~both_near_zero].float().mean().item()
+
+# KI: keyframe indices (formerly slow/fast). 0 = key frame, 1 = intermediate frame.
+K_PATCH = 14
+K_MIN_PIXELS = 10 * 14 * 14
+K_MAX_PIXELS = 10240 * 14 * 14
+MIN_FRAME_SIMILARITY = 0.95
+
+def extract_ki_frames(
+    frames: torch.Tensor,
+    threshold: float = MIN_FRAME_SIMILARITY,
+) -> list:
+    """
+    Label each frame as keyframe (0) or non-keyframe (1) by comparing to the previous keyframe.
+    First frame is always a keyframe; a new keyframe is chosen when similarity drops below threshold.
+    """
+    assert frames.dim() == 4, "Frames must be 4D tensor [N, C, H, W]"
+
+    def _keyframe_indices(f: torch.Tensor) -> list:
+        indices = [0]
+        key = f[0]
+        for i in range(1, f.size(0)):
+            if get_frame_sim(key, f[i]) < threshold:
+                indices.append(i)
+                key = f[i]
+        return indices
+
+    _, _, h, w = frames.shape
+    rh, rw = smart_resize(h, w, factor=K_PATCH, min_pixels=K_MIN_PIXELS, max_pixels=K_MAX_PIXELS)
+    resized = nn.functional.interpolate(frames, (rh, rw), mode="bilinear", antialias=True).float()
+    k_indices = _keyframe_indices(resized)
+    frame_types = torch.ones(frames.size(0), dtype=torch.int32)
+    frame_types[k_indices] = 0
+    return frame_types.tolist()
 
 
 class ChatTemplateKwargs(TypedDict, total=False):
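
A worked example of the new smart_resize, not part of the diff; the values were checked by hand against the same defaults that extract_ki_frames passes in:

    >>> # 1080x1920 rounds to 1078x1918 = 2,067,604 px, above max_pixels = 10240 * 14 * 14 = 2,007,040,
    >>> # so both sides shrink by sqrt(2,073,600 / 2,007,040) ~= 1.016 and floor to multiples of 14.
    >>> smart_resize(1080, 1920, factor=14, min_pixels=10 * 14 * 14, max_pixels=10240 * 14 * 14)
    (1050, 1876)
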
@@ -461,7 +451,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
     ):
         """
         Load and process a video file and return the frames and the timestamps of each frame.
-
         Args:
             video_path (str): Path to the video file.
             start_time (float, optional): Start time in seconds. Defaults to None.
@@ -472,7 +461,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
             size_divisible (int, optional): Size divisible by this number. Defaults to 1.
             precise_time (bool, optional): Whether to use precise time. Defaults to False.
             verbose (bool, optional): Print ffmpeg output. Defaults to False.
-
        Returns:
             frames (List[PIL.Image]): List of frames.
             timestamps (List[float]): List of timestamps.
@@ -551,7 +539,7 @@ class PenguinVLQwen3Processor(ProcessorMixin):
             timestamps = np.concatenate([timestamps, timestamps[-1:].repeat(pad_length) + np.arange(1, pad_length + 1) / fps])
 
         frames_tensor = torch.from_numpy(frames.copy()).float()
-        frame_types = extract_slow_fast_frames(frames_tensor)
+        frame_types = extract_ki_frames(frames_tensor)
 
         frames = [frame for frame in frames]
         timestamps = [timestamp for timestamp in timestamps]
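
For reference, a minimal sketch of the contract at this call site; the tensor shape is a hypothetical stand-in, not code from the PR:

    import torch
    frames_tensor = torch.rand(8, 3, 224, 224) * 255   # stand-in for decoded video frames
    frame_types = extract_ki_frames(frames_tensor)
    assert len(frame_types) == frames_tensor.size(0)   # one label per frame
    assert frame_types[0] == 0                         # first frame is always a keyframe
    assert set(frame_types) <= {0, 1}                  # 0 = keyframe, 1 = intermediate
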
@@ -575,13 +563,11 @@ class PenguinVLQwen3Processor(ProcessorMixin):
         """
         Load a video by prioritizing I-frames (keyframes) and dynamically sampling
         additional frames between adjacent I-frames up to `max_frames`.
-
         Notes:
         - Real codec I-frames (keyframes) are always used as-is and do NOT follow `fps`.
         - If `fps` is provided, it controls how we sample additional non-I frames between
           adjacent I-frames (and still respects `max_frames`).
         - This function does NOT call `load_video_from_ids`.
-
         Returns:
             frames: List[np.ndarray] where each is CHW (3, H, W) uint8
             timestamps: List[float] timestamps in seconds for each returned frame
@@ -747,7 +733,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
         """
         Allocate `remaining` frames across windows proportionally by window width using floor,
         without redistributing leftover.
-
         This matches the spec:
         - prioritize large I-frame windows
         - use floor so the sum does not exceed `remaining`
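
The floor-based rule this docstring describes is easy to check by hand. Below is a hypothetical re-implementation of just the documented behavior (allocate_floor is an illustrative name, not the method in this file):

    import math

    def allocate_floor(remaining, widths):
        # Proportional share per window, floored; the leftover is deliberately dropped.
        total = sum(widths)
        return [math.floor(remaining * w / total) for w in widths]

    # 10 frames over windows of width 7, 5, 3: raw shares 4.67 / 3.33 / 2.0
    # floor to [4, 3, 2]; the sum is 9 <= 10 and the leftover frame is not redistributed.
    assert allocate_floor(10, [7, 5, 3]) == [4, 3, 2]
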
@@ -1347,7 +1332,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
         """
         Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
         conversations to turn them into a single tokenizable string.
-
         Args:
             conversation (`List[Dict, str, str]`):
                 The conversation to format.
@@ -1432,11 +1416,9 @@ class PenguinVLQwen3Processor(ProcessorMixin):
                 Typed dictionary of kwargs specifically required by the model passed.
             tokenizer_init_kwargs (`Dict`, *optional*):
                 Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
-
         Returns:
             output_kwargs (`Dict`):
                 Dictionary of per-modality kwargs to be passed to each modality-specific processor.
-
         """
         # Initialize dictionaries
         output_kwargs = {
@@ -1517,4 +1499,4 @@ class PenguinVLQwen3Processor(ProcessorMixin):
         # all modality-specific kwargs are updated with common kwargs
         for modality in output_kwargs:
             output_kwargs[modality].update(output_kwargs["common_kwargs"])
-        return output_kwargs
+        return output_kwargs
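
Since dict.update overwrites existing keys, the loop above gives common_kwargs precedence over per-modality defaults. A toy illustration of that merge order, with made-up keys:

    output_kwargs = {
        "text_kwargs": {"padding": False},
        "images_kwargs": {"do_resize": True},
        "common_kwargs": {"return_tensors": "pt", "padding": True},
    }
    for modality in output_kwargs:
        output_kwargs[modality].update(output_kwargs["common_kwargs"])

    # common_kwargs wins on conflicts: text padding flips to True,
    # and every modality dict now carries return_tensors="pt".
    assert output_kwargs["text_kwargs"] == {"padding": True, "return_tensors": "pt"}
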
 