Update processing_penguinvl.py

#3
by Cyril666 - opened
Files changed (1) hide show
  1. processing_penguinvl.py +154 -113
processing_penguinvl.py CHANGED
@@ -1,3 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Processor class for PenguinVL."""
2
 
3
  import copy
@@ -204,120 +263,109 @@ def floor_by_factor(number: int, factor: int) -> int:
204
  return math.floor(number / factor) * factor
205
 
206
  def smart_resize(
207
- height: int, width: int,
208
- factor: int = 14,
209
- min_pixels: int = 0,
210
- max_pixels: int = 16384):
 
 
211
  """
212
- Rescales the image so that the following conditions are met:
213
-
214
- 1. Both dimensions (height and width) are divisible by 'factor'.
215
-
216
- 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
217
-
218
- 3. The aspect ratio of the image is maintained as closely as possible.
219
  """
220
-
221
- if max(height, width) / min(height, width) > 200:
 
 
 
 
 
 
 
 
 
 
222
  raise ValueError(
223
- f"absolute aspect ratio must be smaller than {200}, got {max(height, width) / min(height, width)}"
224
  )
225
- h_bar = max(factor, round_by_factor(height, factor))
226
- w_bar = max(factor, round_by_factor(width, factor))
227
- if h_bar * w_bar > max_pixels:
228
- beta = math.sqrt((height * width) / max_pixels)
229
- h_bar = floor_by_factor(height / beta, factor)
230
- w_bar = floor_by_factor(width / beta, factor)
231
- elif h_bar * w_bar < min_pixels:
232
- beta = math.sqrt(min_pixels / (height * width))
233
- h_bar = ceil_by_factor(height * beta, factor)
234
- w_bar = ceil_by_factor(width * beta, factor)
235
- return max(h_bar, factor), max(w_bar, factor)
236
-
237
- def get_frame_sim(frame1, frame2,
238
- patch_size: int=14,
239
- threshold: float = 0.7,
240
- epsilon: float=1e-8):
241
- assert frame1.dim() == 3 and frame2.dim() == 3, "输入必须是3D张量 [C, H, W]"
242
-
243
- # 将PyTorch张量转换为OpenCV格式的numpy数组
244
- def to_numpy_cvt(tensor):
245
- # 确保张量在CPU上并转换为HWC格式
246
- tensor = tensor.cpu().permute(1, 2, 0).numpy()
247
- if tensor.dtype == np.float32 or tensor.dtype == np.float64:
248
- tensor = (tensor).astype(np.uint8)
249
- # 转换为HSV颜色空间
250
- return cv2.cvtColor(tensor, cv2.COLOR_RGB2HSV)
251
-
252
- # 转换颜色空间
253
- frame1_hsv = to_numpy_cvt(frame1)
254
- frame2_hsv = to_numpy_cvt(frame2)
255
-
256
- # 将HSV图像转回PyTorch张量
257
- frame1_tensor = torch.from_numpy(frame1_hsv).permute(2, 0, 1).to(frame1.device).float()
258
- frame2_tensor = torch.from_numpy(frame2_hsv).permute(2, 0, 1).to(frame2.device).float()
259
-
260
- # 分块处理
261
- patch1 = rearrange(
262
- frame1_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
263
- patch2 = rearrange(
264
- frame2_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
265
 
266
  norm1 = torch.norm(patch1, p=2, dim=-1, keepdim=True) + epsilon
267
  norm2 = torch.norm(patch2, p=2, dim=-1, keepdim=True) + epsilon
268
-
269
- normalized1 = patch1 / norm1
270
- normalized2 = patch2 / norm2
271
- cos_sim = (normalized1 * normalized2).sum(dim=-1)
272
-
273
- zero_vector_mask = (norm1.squeeze() < 0.01) & (norm2.squeeze() < 0.01) # 全黑图
274
-
275
- similar = torch.ones_like(cos_sim) # 默认全部相似
276
-
277
- non_zero_mask = ~zero_vector_mask
278
- similar[non_zero_mask] = (cos_sim[non_zero_mask] > threshold).float()
279
-
280
- return similar[non_zero_mask].float().mean().item()
281
-
282
- def extract_slow_fast_frames(frames, threshold = 0.95):
283
- def _extract_slow_indices(frames):
284
- assert frames.dim() == 4, "输入必须是4D张量 [N, C, H, W]"
285
-
286
- # 首帧一定是Slow
287
- slow_indices = [0]
288
- # 定位这里,检查和image[0]报错是不是同一视频
289
- last_key_frame = frames[0]
290
- for i in range(1, frames.size(0)):
291
- current_frame = frames[i]
292
- sim = get_frame_sim(last_key_frame, current_frame)
293
-
294
- if sim < threshold:
295
- slow_indices.append(i)
296
- last_key_frame = current_frame # 更新关键帧
297
-
298
- return slow_indices
299
-
300
- _, _, height, width = frames.shape
301
- resized_height, resized_width = smart_resize(
302
- height,
303
- width,
304
- factor=14,
305
- min_pixels=10 * 14 * 14,
306
- max_pixels=10240 * 14 * 14,
307
- )
308
 
309
- resized_frames = nn.functional.interpolate(
310
- frames,
311
- [resized_height, resized_width],
312
- mode="bilinear",
313
- antialias=True,
314
- ).float()
315
 
316
- slow_indices = _extract_slow_indices(resized_frames)
317
- frame_types = torch.ones(size=(frames.size(0), ), dtype=torch.int32)
318
- frame_types[slow_indices] = 0
 
319
 
320
- return list(frame_types)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
 
323
  class ChatTemplateKwargs(TypedDict, total=False):
@@ -461,7 +509,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
461
  ):
462
  """
463
  Load and process a video file and return the frames and the timestamps of each frame.
464
-
465
  Args:
466
  video_path (str): Path to the video file.
467
  start_time (float, optional): Start time in seconds. Defaults to None.
@@ -472,7 +519,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
472
  size_divisible (int, optional): Size divisible by this number. Defaults to 1.
473
  precise_time (bool, optional): Whether to use precise time. Defaults to False.
474
  verbose (bool, optional): Print ffmpeg output. Defaults to False.
475
-
476
  Returns:
477
  frames (List[PIL.Image]): List of frames.
478
  timestamps (List[float]): List of timestamps.
@@ -551,7 +597,7 @@ class PenguinVLQwen3Processor(ProcessorMixin):
551
  timestamps = np.concatenate([timestamps, timestamps[-1:].repeat(pad_length) + np.arange(1, pad_length + 1) / fps])
552
 
553
  frames_tensor = torch.from_numpy(frames.copy()).float()
554
- frame_types = extract_slow_fast_frames(frames_tensor)
555
 
556
  frames = [frame for frame in frames]
557
  timestamps = [timestamp for timestamp in timestamps]
@@ -575,13 +621,11 @@ class PenguinVLQwen3Processor(ProcessorMixin):
575
  """
576
  Load a video by prioritizing I-frames (keyframes) and dynamically sampling
577
  additional frames between adjacent I-frames up to `max_frames`.
578
-
579
  Notes:
580
  - Real codec I-frames (keyframes) are always used as-is and do NOT follow `fps`.
581
  - If `fps` is provided, it controls how we sample additional non-I frames between
582
  adjacent I-frames (and still respects `max_frames`).
583
  - This function does NOT call `load_video_from_ids`.
584
-
585
  Returns:
586
  frames: List[np.ndarray] where each is CHW (3, H, W) uint8
587
  timestamps: List[float] timestamps in seconds for each returned frame
@@ -747,7 +791,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
747
  """
748
  Allocate `remaining` frames across windows proportionally by window width using floor,
749
  without redistributing leftover.
750
-
751
  This matches the spec:
752
  - prioritize large I-frame windows
753
  - use floor so the sum does not exceed `remaining`
@@ -1347,7 +1390,6 @@ class PenguinVLQwen3Processor(ProcessorMixin):
1347
  """
1348
  Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
1349
  conversations to turn them into a single tokenizable string.
1350
-
1351
  Args:
1352
  conversation (`List[Dict, str, str]`):
1353
  The conversation to format.
@@ -1432,11 +1474,9 @@ class PenguinVLQwen3Processor(ProcessorMixin):
1432
  Typed dictionary of kwargs specifically required by the model passed.
1433
  tokenizer_init_kwargs (`Dict`, *optional*):
1434
  Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
1435
-
1436
  Returns:
1437
  output_kwargs (`Dict`):
1438
  Dictionary of per-modality kwargs to be passed to each modality-specific processor.
1439
-
1440
  """
1441
  # Initialize dictionaries
1442
  output_kwargs = {
@@ -1518,3 +1558,4 @@ class PenguinVLQwen3Processor(ProcessorMixin):
1518
  for modality in output_kwargs:
1519
  output_kwargs[modality].update(output_kwargs["common_kwargs"])
1520
  return output_kwargs
 
 
1
+ Hugging Face's logo
2
+ Hugging Face
3
+ Models
4
+ Datasets
5
+ Spaces
6
+ Community
7
+ Docs
8
+ Pricing
9
+
10
+
11
+ tencent
12
+ /
13
+ Penguin-VL-8B
14
+
15
+ like
16
+ 27
17
+
18
+ Follow
19
+ Tencent
20
+ 8.99k
21
+ Text Generation
22
+ Transformers
23
+ Safetensors
24
+ English
25
+ penguinvl_qwen3
26
+ multi-modal
27
+ large-language-model
28
+ vision-language-model
29
+ vision-encoder
30
+ conversational
31
+ custom_code
32
+
33
+ arxiv:
34
+ 2603.06569
35
+
36
+ License:
37
+ apache-2.0
38
+ Model card
39
+ Files and versions
40
+ xet
41
+ Community
42
+ 2
43
+ Penguin-VL-8B
44
+ /
45
+ processing_penguinvl.py
46
+
47
+ Cyril666's picture
48
+ Cyril666
49
+ Update processing_penguinvl.py
50
+ 8beed62
51
+ verified
52
+ 2 minutes ago
53
+ raw
54
+
55
+ Copy download link
56
+ history
57
+ blame
58
+
59
+ 67.2 kB
60
  """Processor class for PenguinVL."""
61
 
62
  import copy
 
263
  return math.floor(number / factor) * factor
264
 
265
  def smart_resize(
266
+ height: int,
267
+ width: int,
268
+ factor: int = 14,
269
+ min_pixels: int = 0,
270
+ max_pixels: int = 16384,
271
+ ):
272
  """
273
+ Compute target (height, width) such that:
274
+ - Both dimensions are divisible by factor.
275
+ - Total pixels lie in [min_pixels, max_pixels].
276
+ - Aspect ratio is preserved as closely as possible.
 
 
 
277
  """
278
+ def round_by_factor(number: int, factor: int) -> int:
279
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
280
+ return round(number / factor) * factor
281
+ def ceil_by_factor(number: int, factor: int) -> int:
282
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
283
+ return math.ceil(number / factor) * factor
284
+ def floor_by_factor(number: int, factor: int) -> int:
285
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
286
+ return math.floor(number / factor) * factor
287
+
288
+ max_ratio = 200
289
+ if max(height, width) / min(height, width) > max_ratio:
290
  raise ValueError(
291
+ f"Aspect ratio must be < {max_ratio}, got {max(height, width) / min(height, width)}"
292
  )
293
+ h = max(factor, round_by_factor(height, factor))
294
+ w = max(factor, round_by_factor(width, factor))
295
+ if h * w > max_pixels:
296
+ scale = math.sqrt((height * width) / max_pixels)
297
+ h = floor_by_factor(height / scale, factor)
298
+ w = floor_by_factor(width / scale, factor)
299
+ elif h * w < min_pixels:
300
+ scale = math.sqrt(min_pixels / (height * width))
301
+ h = ceil_by_factor(height * scale, factor)
302
+ w = ceil_by_factor(width * scale, factor)
303
+ return max(h, factor), max(w, factor)
304
+
305
+ # Adapted from Keye-VL: https://github.com/Kwai-Keye/Keye
306
+ def get_frame_sim(
307
+ frame1: torch.Tensor,
308
+ frame2: torch.Tensor,
309
+ patch_size: int = 14,
310
+ threshold: float = 0.7,
311
+ epsilon: float = 1e-8,
312
+ ) -> float:
313
+ """Cosine similarity between two frames in HSV, averaged over patches. Returns mean similarity in [0, 1]."""
314
+ assert frame1.dim() == 3 and frame2.dim() == 3, "Frames must be 3D tensors [C, H, W]"
315
+
316
+ def to_hsv_tensor(tensor: torch.Tensor) -> torch.Tensor:
317
+ arr = tensor.cpu().permute(1, 2, 0).numpy()
318
+ if arr.dtype in (np.float32, np.float64):
319
+ arr = arr.astype(np.uint8)
320
+ hsv = cv2.cvtColor(arr, cv2.COLOR_RGB2HSV)
321
+ return torch.from_numpy(hsv).permute(2, 0, 1).to(tensor.device).float()
322
+
323
+ f1 = to_hsv_tensor(frame1)
324
+ f2 = to_hsv_tensor(frame2)
325
+ patch1 = rearrange(f1, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
326
+ patch2 = rearrange(f2, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
 
 
 
 
 
 
327
 
328
  norm1 = torch.norm(patch1, p=2, dim=-1, keepdim=True) + epsilon
329
  norm2 = torch.norm(patch2, p=2, dim=-1, keepdim=True) + epsilon
330
+ cos_sim = (patch1 / norm1 * patch2 / norm2).sum(dim=-1)
331
+
332
+ both_near_zero = (norm1.squeeze() < 0.01) & (norm2.squeeze() < 0.01)
333
+ similar = torch.ones_like(cos_sim)
334
+ similar[~both_near_zero] = (cos_sim[~both_near_zero] > threshold).float()
335
+ return similar[~both_near_zero].float().mean().item()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
 
 
 
 
 
 
337
 
338
+ # KI: keyframe indices (formerly slow/fast). 0 = key frame, 1 = intermediate frame.
339
+ K_PATCH = 14
340
+ K_MIN_PIXELS = 10 * 14 * 14
341
+ K_MAX_PIXELS = 10240 * 14 * 14
342
 
343
+ def extract_ki_frames(
344
+ frames: torch.Tensor,
345
+ threshold: float = MIN_FRAME_SIMILARITY,
346
+ ) -> list:
347
+ """
348
+ Label each frame as keyframe (0) or non-keyframe (1) by comparing to the previous keyframe.
349
+ First frame is always a keyframe; a new keyframe is chosen when similarity drops below threshold.
350
+ """
351
+ assert frames.dim() == 4, "Frames must be 4D tensor [N, C, H, W]"
352
+
353
+ def _keyframe_indices(f: torch.Tensor) -> list:
354
+ indices = [0]
355
+ key = f[0]
356
+ for i in range(1, f.size(0)):
357
+ if get_frame_sim(key, f[i]) < threshold:
358
+ indices.append(i)
359
+ key = f[i]
360
+ return indices
361
+
362
+ _, _, h, w = frames.shape
363
+ rh, rw = smart_resize(h, w, factor=K_PATCH, min_pixels=K_MIN_PIXELS, max_pixels=K_MAX_PIXELS)
364
+ resized = nn.functional.interpolate(frames, (rh, rw), mode="bilinear", antialias=True).float()
365
+ k_indices = _keyframe_indices(resized)
366
+ frame_types = torch.ones(frames.size(0), dtype=torch.int32)
367
+ frame_types[k_indices] = 0
368
+ return frame_types.tolist()
369
 
370
 
371
  class ChatTemplateKwargs(TypedDict, total=False):
 
509
  ):
510
  """
511
  Load and process a video file and return the frames and the timestamps of each frame.
 
512
  Args:
513
  video_path (str): Path to the video file.
514
  start_time (float, optional): Start time in seconds. Defaults to None.
 
519
  size_divisible (int, optional): Size divisible by this number. Defaults to 1.
520
  precise_time (bool, optional): Whether to use precise time. Defaults to False.
521
  verbose (bool, optional): Print ffmpeg output. Defaults to False.
 
522
  Returns:
523
  frames (List[PIL.Image]): List of frames.
524
  timestamps (List[float]): List of timestamps.
 
597
  timestamps = np.concatenate([timestamps, timestamps[-1:].repeat(pad_length) + np.arange(1, pad_length + 1) / fps])
598
 
599
  frames_tensor = torch.from_numpy(frames.copy()).float()
600
+ frame_types = extract_ki_frames(frames_tensor)
601
 
602
  frames = [frame for frame in frames]
603
  timestamps = [timestamp for timestamp in timestamps]
 
621
  """
622
  Load a video by prioritizing I-frames (keyframes) and dynamically sampling
623
  additional frames between adjacent I-frames up to `max_frames`.
 
624
  Notes:
625
  - Real codec I-frames (keyframes) are always used as-is and do NOT follow `fps`.
626
  - If `fps` is provided, it controls how we sample additional non-I frames between
627
  adjacent I-frames (and still respects `max_frames`).
628
  - This function does NOT call `load_video_from_ids`.
 
629
  Returns:
630
  frames: List[np.ndarray] where each is CHW (3, H, W) uint8
631
  timestamps: List[float] timestamps in seconds for each returned frame
 
791
  """
792
  Allocate `remaining` frames across windows proportionally by window width using floor,
793
  without redistributing leftover.
 
794
  This matches the spec:
795
  - prioritize large I-frame windows
796
  - use floor so the sum does not exceed `remaining`
 
1390
  """
1391
  Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
1392
  conversations to turn them into a single tokenizable string.
 
1393
  Args:
1394
  conversation (`List[Dict, str, str]`):
1395
  The conversation to format.
 
1474
  Typed dictionary of kwargs specifically required by the model passed.
1475
  tokenizer_init_kwargs (`Dict`, *optional*):
1476
  Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
 
1477
  Returns:
1478
  output_kwargs (`Dict`):
1479
  Dictionary of per-modality kwargs to be passed to each modality-specific processor.
 
1480
  """
1481
  # Initialize dictionaries
1482
  output_kwargs = {
 
1558
  for modality in output_kwargs:
1559
  output_kwargs[modality].update(output_kwargs["common_kwargs"])
1560
  return output_kwargs
1561
+