tencent
/

Penguin-VL-8B

@@ -204,120 +204,109 @@ def floor_by_factor(number: int, factor: int) -> int:
     return math.floor(number / factor) * factor
 def smart_resize(
-        height: int, width: int,
-        factor: int = 14,
-        min_pixels: int = 0,
-        max_pixels: int = 16384):
     """
-    Computes a target (height, width) for rescaling an image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    Returns the pair (height, width); each side is at least 'factor'.
-    Raises ValueError when the longer side is more than 200 times the shorter side.
     """
-    if max(height, width) / min(height, width) > 200:  # reject extreme aspect ratios up front
         raise ValueError(
-            f"absolute aspect ratio must be smaller than {200}, got {max(height, width) / min(height, width)}"
         )
-    h_bar = max(factor, round_by_factor(height, factor))  # nearest multiple of factor, never below one factor
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:  # too many pixels: shrink both sides by the same ratio, rounding down
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:  # too few pixels: grow both sides by the same ratio, rounding up
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    return max(h_bar, factor), max(w_bar, factor)  # flooring can yield 0, so clamp each side to factor
-def get_frame_sim(frame1, frame2,
-                  patch_size: int=14,
-                  threshold: float = 0.7,
-                  epsilon: float=1e-8):  # returns the fraction of compared patches whose cosine similarity exceeds threshold
-    assert frame1.dim() == 3 and frame2.dim() == 3, "输入必须是3D张量 [C, H, W]"
-    # Convert a PyTorch tensor into an OpenCV-style numpy array
-    def to_numpy_cvt(tensor):
-        # Make sure the tensor is on the CPU and switch it to HWC layout
-        tensor = tensor.cpu().permute(1, 2, 0).numpy()
-        if tensor.dtype == np.float32 or tensor.dtype == np.float64:
-            tensor = (tensor).astype(np.uint8)  # NOTE(review): plain cast, no rescaling - presumably inputs are already 0-255; confirm
-        # Convert to the HSV color space
-        return cv2.cvtColor(tensor, cv2.COLOR_RGB2HSV)
-    # Convert both frames to HSV
-    frame1_hsv = to_numpy_cvt(frame1)
-    frame2_hsv = to_numpy_cvt(frame2)
-    # Turn the HSV images back into PyTorch tensors
-    frame1_tensor = torch.from_numpy(frame1_hsv).permute(2, 0, 1).to(frame1.device).float()
-    frame2_tensor = torch.from_numpy(frame2_hsv).permute(2, 0, 1).to(frame2.device).float()
-    # Split into patches
-    patch1 = rearrange(
-        frame1_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
-    patch2 = rearrange(
-        frame2_tensor, "c (h p1) (w p2) -> h w (c p1 p2)", p1=patch_size, p2=patch_size).float()
     norm1 = torch.norm(patch1, p=2, dim=-1, keepdim=True) + epsilon
     norm2 = torch.norm(patch2, p=2, dim=-1, keepdim=True) + epsilon
-    normalized1 = patch1 / norm1
-    normalized2 = patch2 / norm2
-    cos_sim = (normalized1 * normalized2).sum(dim=-1)
-    zero_vector_mask = (norm1.squeeze() < 0.01) & (norm2.squeeze() < 0.01)  # patch is all black in both frames
-    similar = torch.ones_like(cos_sim)  # default: every patch counts as similar
-    non_zero_mask = ~zero_vector_mask
-    similar[non_zero_mask] = (cos_sim[non_zero_mask] > threshold).float()
-    return similar[non_zero_mask].float().mean().item()  # mean over the non-black patches only
-def extract_slow_fast_frames(frames, threshold = 0.95):  # labels every frame 0 (slow) or 1 (fast); returns one entry per frame
-    def _extract_slow_indices(frames):
-        assert frames.dim() == 4, "输入必须是4D张量 [N, C, H, W]"
-        # The first frame is always a slow frame
-        slow_indices = [0]
-        # Debug note: look here to check whether the image[0] error comes from the same video
-        last_key_frame = frames[0]
-        for i in range(1, frames.size(0)):
-            current_frame = frames[i]
-            sim = get_frame_sim(last_key_frame, current_frame)
-            if sim < threshold:
-                slow_indices.append(i)
-                last_key_frame = current_frame  # this frame becomes the new key frame
-        return slow_indices
-    _, _, height, width = frames.shape
-    resized_height, resized_width = smart_resize(
-        height,
-        width,
-        factor=14,
-        min_pixels=10 * 14 * 14,
-        max_pixels=10240 * 14 * 14,
-    )
-    resized_frames = nn.functional.interpolate(
-        frames,
-        [resized_height, resized_width],
-        mode="bilinear",
-        antialias=True,
-    ).float()
-    slow_indices = _extract_slow_indices(resized_frames)
-    frame_types = torch.ones(size=(frames.size(0), ), dtype=torch.int32)  # start with every frame marked 1
-    frame_types[slow_indices] = 0
-    return list(frame_types)  # list() over a 1-D tensor yields 0-dim tensors, not Python ints
 class ChatTemplateKwargs(TypedDict, total=False):

     return math.floor(number / factor) * factor
 def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 14,
+    min_pixels: int = 0,
+    max_pixels: int = 16384,
+):
     """
+    Compute target (height, width) such that:
+    - Both dimensions are divisible by factor.
+    - Total pixels lie in [min_pixels, max_pixels].
+    - Aspect ratio is preserved as closely as possible.
     """
+    def round_by_factor(number: int, factor: int) -> int:
+        """Returns the closest integer to 'number' that is divisible by 'factor'."""
+        return round(number / factor) * factor
+    def ceil_by_factor(number: int, factor: int) -> int:
+        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+        return math.ceil(number / factor) * factor
+    def floor_by_factor(number: int, factor: int) -> int:
+        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+        return math.floor(number / factor) * factor
+    max_ratio = 200
+    if max(height, width) / min(height, width) > max_ratio:
         raise ValueError(
+            f"Aspect ratio must be < {max_ratio}, got {max(height, width) / min(height, width)}"
         )
+    h = max(factor, round_by_factor(height, factor))
+    w = max(factor, round_by_factor(width, factor))
+    if h * w > max_pixels:
+        scale = math.sqrt((height * width) / max_pixels)
+        h = floor_by_factor(height / scale, factor)
+        w = floor_by_factor(width / scale, factor)
+    elif h * w < min_pixels:
+        scale = math.sqrt(min_pixels / (height * width))
+        h = ceil_by_factor(height * scale, factor)
+        w = ceil_by_factor(width * scale, factor)
+    return max(h, factor), max(w, factor)
+# Adapted from Keye-VL: https://github.com/Kwai-Keye/Keye
def get_frame_sim(
    frame1: torch.Tensor,
    frame2: torch.Tensor,
    patch_size: int = 14,
    threshold: float = 0.7,
    epsilon: float = 1e-8,
) -> float:
    """Return the fraction of patches that look alike in two frames.

    Both frames are converted to HSV and cut into ``patch_size`` x
    ``patch_size`` patches. For every patch position the cosine similarity of
    the two flattened patches is computed; a position counts as similar when
    that value exceeds ``threshold``. Positions where both patches are
    (near-)zero are left out of the average.

    Args:
        frame1: Tensor of shape [C, H, W].
        frame2: Tensor of shape [C, H, W].
        patch_size: Side length of the square patches. H and W must be
            divisible by it for the patch split to succeed.
        threshold: Cosine similarity above which a patch pair is "similar".
        epsilon: Added to each patch norm to avoid division by zero.

    Returns:
        A float in [0, 1]: the share of compared patch positions that are
        similar. Returns 1.0 when no position is comparable (every patch is
        near-zero in both frames).
    """
    assert frame1.dim() == 3 and frame2.dim() == 3, "Frames must be 3D tensors [C, H, W]"

    def to_hsv_tensor(tensor: torch.Tensor) -> torch.Tensor:
        """Convert a [C, H, W] RGB tensor to a float HSV tensor on the same device."""
        arr = tensor.detach().cpu().permute(1, 2, 0).numpy()
        if arr.dtype in (np.float32, np.float64):
            # NOTE(review): plain cast without rescaling - presumably float
            # inputs already hold 0-255 values; confirm against the caller.
            arr = arr.astype(np.uint8)
        hsv = cv2.cvtColor(arr, cv2.COLOR_RGB2HSV)
        return torch.from_numpy(hsv).permute(2, 0, 1).to(tensor.device).float()

    pattern = "c (h p1) (w p2) -> h w (c p1 p2)"
    patch1 = rearrange(to_hsv_tensor(frame1), pattern, p1=patch_size, p2=patch_size).float()
    patch2 = rearrange(to_hsv_tensor(frame2), pattern, p1=patch_size, p2=patch_size).float()

    norm1 = torch.norm(patch1, p=2, dim=-1, keepdim=True) + epsilon
    norm2 = torch.norm(patch2, p=2, dim=-1, keepdim=True) + epsilon
    cos_sim = ((patch1 / norm1) * (patch2 / norm2)).sum(dim=-1)

    # Drop only the trailing keepdim axis. A bare squeeze() would also collapse
    # a patch grid with a single row or column and break the mask shape.
    both_near_zero = (norm1.squeeze(-1) < 0.01) & (norm2.squeeze(-1) < 0.01)
    comparable = ~both_near_zero
    if not comparable.any():
        # Nothing to compare: averaging an empty selection would give NaN.
        return 1.0
    return (cos_sim[comparable] > threshold).float().mean().item()
# Keyframe labelling ("KI", formerly slow/fast): 0 marks a key frame, 1 an intermediate frame.
# The pixel bounds are expressed as a number of K_PATCH x K_PATCH patches.
K_PATCH = 14
K_MIN_PIXELS = 10 * K_PATCH * K_PATCH
K_MAX_PIXELS = 10240 * K_PATCH * K_PATCH
def extract_ki_frames(
    frames: torch.Tensor,
    threshold: float = MIN_FRAME_SIMILARITY,
) -> list:
    """Label each frame as a key frame (0) or an intermediate frame (1).

    Frames are resized to a patch-aligned resolution and each one is compared
    with the most recent key frame. The first frame is always a key frame; a
    later frame becomes the new key frame when its similarity to the current
    key frame drops below ``threshold``.

    Args:
        frames: Tensor of shape [N, C, H, W].
        threshold: Similarity below which a frame starts a new key frame.

    Returns:
        A list of N ints (0 or 1), one per frame. Empty when N is 0.
    """
    assert frames.dim() == 4, "Frames must be 4D tensor [N, C, H, W]"

    num_frames = frames.size(0)
    if num_frames == 0:
        # No frames, no labels; indexing frame 0 below would raise IndexError.
        return []

    _, _, h, w = frames.shape
    rh, rw = smart_resize(h, w, factor=K_PATCH, min_pixels=K_MIN_PIXELS, max_pixels=K_MAX_PIXELS)
    # NOTE(review): interpolation runs in the input dtype; the cast to float
    # happens afterwards.
    resized = nn.functional.interpolate(frames, (rh, rw), mode="bilinear", antialias=True).float()

    frame_types = [1] * num_frames
    frame_types[0] = 0
    key = resized[0]
    for i in range(1, num_frames):
        if get_frame_sim(key, resized[i]) < threshold:
            frame_types[i] = 0
            key = resized[i]
    return frame_types
 class ChatTemplateKwargs(TypedDict, total=False):