Kwai-Keye
/

Keye-VL-8B-Preview

Video-Text-to-Text

feature-extraction

Model card Files Files and versions

Kwai-Keye committed on Jun 26, 2025

Commit

a56823f

·

verified ·

1 Parent(s): 03ea770

Add model files

Files changed (2) hide show

image_processing_keye.py +6 -6
preprocessor_config.json +2 -2

image_processing_keye.py CHANGED Viewed

@@ -128,8 +128,8 @@ def smart_resize(
     height: int,
     width: int,
     factor: int = 28,
-    min_pixels: int = 56 * 56,
-    max_pixels: int = 14 * 14 * 4096,
 ):
     """Rescales the image so that the following conditions are met:
@@ -193,9 +193,9 @@ class SiglipImageProcessor(BaseImageProcessor):
             Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
         do_convert_rgb (`bool`, *optional*, defaults to `True`):
             Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
             The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
             The max pixels of the image to resize the image.
         patch_size (`int`, *optional*, defaults to 14):
             The spacial patch size of the vision encoder.
@@ -222,8 +222,8 @@ class SiglipImageProcessor(BaseImageProcessor):
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_convert_rgb: bool = True,
-        min_pixels: int = 56 * 56,
-        max_pixels: int = 28 * 28 * 1280,
         patch_size: int = 14,
         temporal_patch_size: int = 1,
         merge_size: int = 2,

     height: int,
     width: int,
     factor: int = 28,
+    min_pixels: int = 28 * 28 * 130,
+    max_pixels: int = 28 * 28 * 1670,
 ):
     """Rescales the image so that the following conditions are met:
             Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
         do_convert_rgb (`bool`, *optional*, defaults to `True`):
             Whether to convert the image to RGB.
+        min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
             The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1670`):
             The max pixels of the image to resize the image.
         patch_size (`int`, *optional*, defaults to 14):
             The spacial patch size of the vision encoder.
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_convert_rgb: bool = True,
+        min_pixels: int = 28 * 28 * 130,
+        max_pixels: int = 28 * 28 * 1670,
         patch_size: int = 14,
         temporal_patch_size: int = 1,
         merge_size: int = 2,

preprocessor_config.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "min_pixels": 3136,
-  "max_pixels": 1003520,
   "patch_size": 14,
   "temporal_patch_size": 1,
   "merge_size": 2,

 {
+  "min_pixels": 101920,
+  "max_pixels": 1309280,
   "patch_size": 14,
   "temporal_patch_size": 1,
   "merge_size": 2,