fix: type annotation bugs in media_utils, processor and config

#35
Files changed (3) hide show
  1. configuration_kimi_k25.py +39 -36
  2. kimi_k25_processor.py +168 -165
  3. media_utils.py +362 -368
configuration_kimi_k25.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from transformers.configuration_utils import PretrainedConfig
2
 
3
  try:
@@ -7,36 +9,35 @@ except ImportError:
7
 
8
 
9
  class KimiK25VisionConfig(PretrainedConfig):
10
-
11
  def __init__(
12
- self,
13
- patch_size: int = 14,
14
- init_pos_emb_height: int = 64,
15
- init_pos_emb_width: int = 64,
16
- init_pos_emb_time: int = 4,
17
- pos_emb_type: str = 'divided_fixed',
18
- vt_num_attention_heads: int = 16,
19
- vt_num_hidden_layers: int = 27,
20
- vt_hidden_size: int = 1152,
21
- vt_intermediate_size: int = 4304,
22
- merge_kernel_size: tuple = (2, 2),
23
- video_attn_type: str = 'spatial_temporal',
24
- merge_type: str = 'sd2_tpool',
25
- _attn_implementation: str = 'flash_attention_2',
26
- # MM Projector parameters
27
- mm_projector_type: str = 'patchmerger',
28
- mm_hidden_size: int | None = None,
29
- projector_hidden_act: str = "gelu",
30
- projector_ln_eps: float = 1e-5,
31
- # Other parameters
32
- ignore_index: int = -100,
33
- media_placeholder_token_id: int = 163605,
34
- pad_token_id: int = 0,
35
- use_unified_vision_chunk: bool = True,
36
- video_placeholder="<|kimi_k25_video_placeholder|>",
37
- text_hidden_size=7168,
38
- **vision_config_kwargs):
39
-
40
  self.patch_size = patch_size
41
  self.init_pos_emb_height = init_pos_emb_height
42
  self.init_pos_emb_width = init_pos_emb_width
@@ -53,7 +54,9 @@ class KimiK25VisionConfig(PretrainedConfig):
53
 
54
  # MM Projector config
55
  self.mm_projector_type = mm_projector_type
56
- self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else vt_hidden_size
 
 
57
  self.projector_hidden_act = projector_hidden_act
58
  self.projector_ln_eps = projector_ln_eps
59
  self.text_hidden_size = text_hidden_size
@@ -64,7 +67,7 @@ class KimiK25Config(PretrainedConfig):
64
 
65
  Args:
66
  text_config (dict | DeepseekV3Config): Configuration for the text model.
67
-
68
  Vision Tower Parameters (from MoonViT3dConfig):
69
  patch_size (int): Patch size for vision tower.
70
  init_pos_emb_height (int): Initial position embedding height.
@@ -79,13 +82,13 @@ class KimiK25Config(PretrainedConfig):
79
  video_attn_type (str): Type of video attention.
80
  merge_type (str): Type of merge operation.
81
  _attn_implementation (str): Attention implementation type.
82
-
83
  MM Projector Parameters (from MultiModalProjectorConfig):
84
  mm_projector_type (str): Type of multimodal projector.
85
  mm_hidden_size (int): Hidden size from vision tower (should match vt_hidden_size).
86
  projector_hidden_act (str): Activation function for projector.
87
  projector_ln_eps (float): Layer norm epsilon for projector.
88
-
89
  Other Parameters:
90
  ignore_index (int): The ignore index for the loss function.
91
  media_placeholder_token_id (int): The token ID to use for media placeholders.
@@ -96,14 +99,14 @@ class KimiK25Config(PretrainedConfig):
96
 
97
  def __init__(
98
  self,
99
- text_config: dict | DeepseekV3Config = None,
100
- vision_config: dict | KimiK25VisionConfig = None,
101
  # Other parameters
102
  ignore_index: int = -100,
103
  media_placeholder_token_id: int = 163605,
104
  pad_token_id: int = 0,
105
  use_unified_vision_chunk: bool = True,
106
- video_placeholder="<|kimi_k25_video_placeholder|>",
107
  **kwargs,
108
  ):
109
  if isinstance(text_config, dict):
 
1
+ from typing import Optional, Union
2
+
3
  from transformers.configuration_utils import PretrainedConfig
4
 
5
  try:
 
9
 
10
 
11
  class KimiK25VisionConfig(PretrainedConfig):
 
12
  def __init__(
13
+ self,
14
+ patch_size: int = 14,
15
+ init_pos_emb_height: int = 64,
16
+ init_pos_emb_width: int = 64,
17
+ init_pos_emb_time: int = 4,
18
+ pos_emb_type: str = "divided_fixed",
19
+ vt_num_attention_heads: int = 16,
20
+ vt_num_hidden_layers: int = 27,
21
+ vt_hidden_size: int = 1152,
22
+ vt_intermediate_size: int = 4304,
23
+ merge_kernel_size: tuple = (2, 2),
24
+ video_attn_type: str = "spatial_temporal",
25
+ merge_type: str = "sd2_tpool",
26
+ _attn_implementation: str = "flash_attention_2",
27
+ # MM Projector parameters
28
+ mm_projector_type: str = "patchmerger",
29
+ mm_hidden_size: int | None = None,
30
+ projector_hidden_act: str = "gelu",
31
+ projector_ln_eps: float = 1e-5,
32
+ # Other parameters
33
+ ignore_index: int = -100,
34
+ media_placeholder_token_id: int = 163605,
35
+ pad_token_id: int = 0,
36
+ use_unified_vision_chunk: bool = True,
37
+ video_placeholder="<|kimi_k25_video_placeholder|>",
38
+ text_hidden_size=7168,
39
+ **vision_config_kwargs,
40
+ ):
41
  self.patch_size = patch_size
42
  self.init_pos_emb_height = init_pos_emb_height
43
  self.init_pos_emb_width = init_pos_emb_width
 
54
 
55
  # MM Projector config
56
  self.mm_projector_type = mm_projector_type
57
+ self.mm_hidden_size = (
58
+ mm_hidden_size if mm_hidden_size is not None else vt_hidden_size
59
+ )
60
  self.projector_hidden_act = projector_hidden_act
61
  self.projector_ln_eps = projector_ln_eps
62
  self.text_hidden_size = text_hidden_size
 
67
 
68
  Args:
69
  text_config (dict | DeepseekV3Config): Configuration for the text model.
70
+
71
  Vision Tower Parameters (from MoonViT3dConfig):
72
  patch_size (int): Patch size for vision tower.
73
  init_pos_emb_height (int): Initial position embedding height.
 
82
  video_attn_type (str): Type of video attention.
83
  merge_type (str): Type of merge operation.
84
  _attn_implementation (str): Attention implementation type.
85
+
86
  MM Projector Parameters (from MultiModalProjectorConfig):
87
  mm_projector_type (str): Type of multimodal projector.
88
  mm_hidden_size (int): Hidden size from vision tower (should match vt_hidden_size).
89
  projector_hidden_act (str): Activation function for projector.
90
  projector_ln_eps (float): Layer norm epsilon for projector.
91
+
92
  Other Parameters:
93
  ignore_index (int): The ignore index for the loss function.
94
  media_placeholder_token_id (int): The token ID to use for media placeholders.
 
99
 
100
  def __init__(
101
  self,
102
+ text_config: Optional[Union[dict, DeepseekV3Config]] = None,
103
+ vision_config: Optional[Union[dict, KimiK25VisionConfig]] = None,
104
  # Other parameters
105
  ignore_index: int = -100,
106
  media_placeholder_token_id: int = 163605,
107
  pad_token_id: int = 0,
108
  use_unified_vision_chunk: bool = True,
109
+ video_placeholder: str = "<|kimi_k25_video_placeholder|>",
110
  **kwargs,
111
  ):
112
  if isinstance(text_config, dict):
kimi_k25_processor.py CHANGED
@@ -1,165 +1,168 @@
1
- from transformers.feature_extraction_utils import BatchFeature
2
- from transformers.processing_utils import ProcessorMixin
3
- from transformers.utils import logging
4
-
5
- logger = logging.get_logger(__name__)
6
-
7
-
8
- class KimiK25Processor(ProcessorMixin):
9
- r"""
10
- Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.
11
-
12
- [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
13
- [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.
14
-
15
- Args:
16
- image_processor ([`KimiK25ImageProcessor`], *optional*):
17
- The image processor is a required input.
18
- tokenizer ([`TikTokenTokenizer`], *optional*):
19
- The tokenizer is a required input.
20
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
21
- in a chat into a tokenizable string.
22
- """
23
-
24
- attributes = ["image_processor", "tokenizer"]
25
- valid_kwargs = ["chat_template"]
26
- image_processor_class = "AutoImageProcessor"
27
- tokenizer_class = "AutoTokenizer"
28
-
29
- def __init__(
30
- self,
31
- image_processor=None,
32
- tokenizer=None,
33
- chat_template=None,
34
- **kwargs,
35
- ):
36
- super().__init__(image_processor,
37
- tokenizer,
38
- chat_template=chat_template)
39
- self.media_processor = image_processor
40
- # A special temporal placeholder to be replaced by actual video placeholders
41
- self.video_placeholder = "<|kimi_k25_video_placeholder|>"
42
-
43
- def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
44
- # replace video prompt in text with video chunk prompts
45
- video_count = text.count(self.video_placeholder)
46
- if video_count == 0:
47
- return text
48
- assert video_count == len(video_prompts)
49
- text_parts = text.split(self.video_placeholder)
50
- assert len(text_parts) == len(video_prompts) + 1
51
- text = "".join([
52
- text_parts[i] + video_prompts[i] for i in range(len(video_prompts))
53
- ])
54
- text += text_parts[-1]
55
- return text
56
-
57
- def preprocess_medias(self, medias: list[dict]) -> list[dict]:
58
- updated_medias = []
59
- video_prompts = []
60
- for media in medias:
61
- if media['type'] == 'image':
62
- updated_medias.append(media)
63
- elif media['type'] == 'video':
64
- video_chunks = self.media_processor.split_video_chunks(
65
- media['video'])
66
- updated_medias.extend(video_chunks)
67
- video_prompts.append("".join(
68
- [vc['prompt'] for vc in video_chunks]))
69
- else:
70
- raise ValueError(f"unsupported media type: {media['type']}")
71
- return updated_medias, video_prompts
72
-
73
- def __call__(self,
74
- messages: list[dict] = None,
75
- medias: list[dict] = None,
76
- text: str = None,
77
- return_tensors: str = "pt",
78
- **kwargs) -> BatchFeature:
79
- """
80
- Process multimodal inputs for Kimi-K2.5 model.
81
-
82
- This processor accepts ordered messages and extracts both media and text in a single pass.
83
- text will be automatically updated if video input detected in messages
84
-
85
- Args:
86
- messages: List of message dicts with 'role' and 'content' fields.
87
- If provided, medias and text will be extracted automatically.
88
- medias: Pre-extracted list of media dicts. If None, extracted from messages.
89
- text: Pre-formatted text string. If None, generated via apply_chat_template.
90
- return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
91
- **kwargs: Additional arguments passed to tokenizer.apply_chat_template.
92
-
93
- Returns:
94
- BatchFeature with fields: input_ids, attention_mask, pixel_values, grid_thws.
95
- """
96
- if messages is None and (medias is None or text is None):
97
- raise ValueError(
98
- "Provide either 'messages' or both 'medias' and 'text'")
99
-
100
- if medias is not None and text is not None:
101
- updated_medias, video_prompts = self.preprocess_medias(medias)
102
- preprocessed = self.media_processor.preprocess(
103
- updated_medias, return_tensors=return_tensors)
104
- text = self.update_raw_text(text, video_prompts)
105
- text_inputs = self.tokenizer(text, return_tensors=return_tensors)
106
- return BatchFeature(data={**text_inputs, **preprocessed.data})
107
-
108
- if medias is None:
109
- medias = self._extract_medias_from_messages(messages)
110
- updated_medias, video_prompts = self.preprocess_medias(medias)
111
- preprocessed = self.media_processor.preprocess(
112
- updated_medias, return_tensors=return_tensors)
113
-
114
- # Generate text if not provided
115
- if text is None:
116
- text = self.tokenizer.apply_chat_template(messages, **kwargs)
117
-
118
- text = self.update_raw_text(text, video_prompts)
119
-
120
- text_inputs = self.tokenizer(text, return_tensors=return_tensors)
121
- return BatchFeature(data={**text_inputs, **preprocessed.data})
122
-
123
- @staticmethod
124
- def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
125
- """
126
- Extract media items from messages in a single pass.
127
-
128
- This is an optimized version that processes messages only once.
129
- Kept as internal method since external callers should use __call__.
130
- """
131
- medias = []
132
- for msg in messages:
133
- if msg['role'] != 'user' or not msg.get('content'):
134
- continue
135
-
136
- for content_part in msg['content']:
137
- if not isinstance(content_part, dict):
138
- continue
139
-
140
- content_type = content_part.get('type')
141
- if content_type in ['video_url', 'video']:
142
- medias.append({
143
- 'type': 'video',
144
- 'video': content_part['video_url']['url'],
145
- 'first_frame_timestamp': 0.0
146
- })
147
- elif content_type in ['image_url', 'image']:
148
- medias.append({
149
- 'type': 'image',
150
- 'image': content_part['image_url'],
151
- })
152
- return medias
153
-
154
- def apply_chat_template(self, messages, **kwargs):
155
- return self.tokenizer.apply_chat_template(messages, **kwargs)
156
-
157
- def batch_decode(self, *args, **kwargs):
158
- return self.tokenizer.batch_decode(*args, **kwargs)
159
-
160
- def decode(self, *args, **kwargs):
161
- return self.tokenizer.decode(*args, **kwargs)
162
-
163
- @property
164
- def model_input_names(self):
165
- return ['input_ids', 'attention_mask', 'pixel_values', 'grid_thws']
 
 
 
 
1
+ from transformers.feature_extraction_utils import BatchFeature
2
+ from transformers.processing_utils import ProcessorMixin
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+
8
+ class KimiK25Processor(ProcessorMixin):
9
+ r"""
10
+ Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.
11
+
12
+ [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
13
+ [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.
14
+
15
+ Args:
16
+ image_processor ([`KimiK25ImageProcessor`], *optional*):
17
+ The image processor is a required input.
18
+ tokenizer ([`TikTokenTokenizer`], *optional*):
19
+ The tokenizer is a required input.
20
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
21
+ in a chat into a tokenizable string.
22
+ """
23
+
24
+ attributes = ["image_processor", "tokenizer"]
25
+ valid_kwargs = ["chat_template"]
26
+ image_processor_class = "AutoImageProcessor"
27
+ tokenizer_class = "AutoTokenizer"
28
+
29
+ def __init__(
30
+ self,
31
+ image_processor=None,
32
+ tokenizer=None,
33
+ chat_template=None,
34
+ **kwargs,
35
+ ):
36
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
37
+ self.media_processor = image_processor
38
+ # A temporary placeholder to be replaced by actual video placeholders
39
+ self.video_placeholder = "<|kimi_k25_video_placeholder|>"
40
+
41
+ def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
42
+ # replace video prompt in text with video chunk prompts
43
+ video_count = text.count(self.video_placeholder)
44
+ if video_count == 0:
45
+ return text
46
+ assert video_count == len(video_prompts)
47
+ text_parts = text.split(self.video_placeholder)
48
+ assert len(text_parts) == len(video_prompts) + 1
49
+ text = "".join(
50
+ [text_parts[i] + video_prompts[i] for i in range(len(video_prompts))]
51
+ )
52
+ text += text_parts[-1]
53
+ return text
54
+
55
+ def preprocess_medias(self, medias: list[dict]) -> tuple[list[dict], list[str]]:
56
+ updated_medias = []
57
+ video_prompts = []
58
+ for media in medias:
59
+ if media["type"] == "image":
60
+ updated_medias.append(media)
61
+ elif media["type"] == "video":
62
+ video_chunks = self.media_processor.split_video_chunks(media["video"])
63
+ updated_medias.extend(video_chunks)
64
+ video_prompts.append("".join([vc["prompt"] for vc in video_chunks]))
65
+ else:
66
+ raise ValueError(f"unsupported media type: {media['type']}")
67
+ return updated_medias, video_prompts
68
+
69
+ def __call__(
70
+ self,
71
+ messages: list[dict] = None,
72
+ medias: list[dict] = None,
73
+ text: str = None,
74
+ return_tensors: str = "pt",
75
+ **kwargs,
76
+ ) -> BatchFeature:
77
+ """
78
+ Process multimodal inputs for Kimi-K2.5 model.
79
+
80
+ This processor accepts ordered messages and extracts both media and text in a single pass.
81
+ text will be automatically updated if video input is detected in messages
82
+
83
+ Args:
84
+ messages: List of message dicts with 'role' and 'content' fields.
85
+ If provided, medias and text will be extracted automatically.
86
+ medias: Pre-extracted list of media dicts. If None, extracted from messages.
87
+ text: Pre-formatted text string. If None, generated via apply_chat_template.
88
+ return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
89
+ **kwargs: Additional arguments passed to tokenizer.apply_chat_template.
90
+
91
+ Returns:
92
+ BatchFeature with fields: input_ids, attention_mask, pixel_values, grid_thws.
93
+ """
94
+ if messages is None and (medias is None or text is None):
95
+ raise ValueError("Provide either 'messages' or both 'medias' and 'text'")
96
+
97
+ if medias is not None and text is not None:
98
+ updated_medias, video_prompts = self.preprocess_medias(medias)
99
+ preprocessed = self.media_processor.preprocess(
100
+ updated_medias, return_tensors=return_tensors
101
+ )
102
+ text = self.update_raw_text(text, video_prompts)
103
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
104
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
105
+
106
+ if medias is None:
107
+ medias = self._extract_medias_from_messages(messages)
108
+ updated_medias, video_prompts = self.preprocess_medias(medias)
109
+ preprocessed = self.media_processor.preprocess(
110
+ updated_medias, return_tensors=return_tensors
111
+ )
112
+
113
+ # Generate text if not provided
114
+ if text is None:
115
+ text = self.tokenizer.apply_chat_template(messages, **kwargs)
116
+
117
+ text = self.update_raw_text(text, video_prompts)
118
+
119
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
120
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
121
+
122
+ @staticmethod
123
+ def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
124
+ """
125
+ Extract media items from messages in a single pass.
126
+
127
+ This is an optimized version that processes messages only once.
128
+ Kept as internal method since external callers should use __call__.
129
+ """
130
+ medias = []
131
+ for msg in messages:
132
+ if msg["role"] != "user" or not msg.get("content"):
133
+ continue
134
+
135
+ for content_part in msg["content"]:
136
+ if not isinstance(content_part, dict):
137
+ continue
138
+
139
+ content_type = content_part.get("type")
140
+ if content_type in ["video_url", "video"]:
141
+ medias.append(
142
+ {
143
+ "type": "video",
144
+ "video": content_part["video_url"]["url"],
145
+ "first_frame_timestamp": 0.0,
146
+ }
147
+ )
148
+ elif content_type in ["image_url", "image"]:
149
+ medias.append(
150
+ {
151
+ "type": "image",
152
+ "image": content_part["image_url"],
153
+ }
154
+ )
155
+ return medias
156
+
157
+ def apply_chat_template(self, messages, **kwargs):
158
+ return self.tokenizer.apply_chat_template(messages, **kwargs)
159
+
160
+ def batch_decode(self, *args, **kwargs):
161
+ return self.tokenizer.batch_decode(*args, **kwargs)
162
+
163
+ def decode(self, *args, **kwargs):
164
+ return self.tokenizer.decode(*args, **kwargs)
165
+
166
+ @property
167
+ def model_input_names(self):
168
+ return ["input_ids", "attention_mask", "pixel_values", "grid_thws"]
media_utils.py CHANGED
@@ -1,368 +1,362 @@
1
- import base64
2
- import io
3
- import math
4
- import os
5
- from datetime import datetime, timezone
6
- from typing import List, Literal, Optional, TypedDict
7
-
8
- import numpy as np
9
- from PIL import Image
10
- from pydantic import BaseModel, Field
11
-
12
- try:
13
- from mecord import VideoReader
14
- except ImportError:
15
- VideoReader = None
16
-
17
-
18
- class VideoSpec(BaseModel):
19
- media_type: str = Literal['video']
20
- height: int = Field(..., gt=0, description="video frame height")
21
- width: int = Field(..., gt=0, description="video frame width")
22
- num_frames: int = Field(..., gt=0, description="num frames")
23
- fps: float = Field(..., gt=0, description="average fps")
24
-
25
- # optional, help to accelerate video reading
26
- key_indices: list[int] = Field(None, description="key indices")
27
- frame_time_info: dict = Field(None, description="frame time info")
28
-
29
-
30
- class ImageInput(TypedDict):
31
- type: Literal['image']
32
- image: Image.Image
33
-
34
-
35
- class VideoChunkInput(TypedDict):
36
- type: Literal['video_chunk']
37
- video_chunk: List[Image.Image]
38
- prompt: Optional[str] = None
39
-
40
-
41
- MediaInput = ImageInput | VideoChunkInput
42
-
43
-
44
- def get_video_meta(video_src: bytes | str | os.PathLike,
45
- accurate: bool = True) -> dict:
46
- """Get the dimensions of a video."""
47
- if isinstance(video_src, os.PathLike):
48
- video_src = str(video_src)
49
- # if b64 string, decode to bytes
50
- if isinstance(video_src,
51
- str) and video_src.startswith('data:video/mp4;base64,'):
52
- video_src = base64.b64decode(video_src.split(',')[1])
53
- video = VideoReader(video_src, auto_init=accurate, num_threads=1)
54
- assert video.num_frames > 0, "Invalid video format."
55
- assert video.original_width > 0 and video.original_height > 0, (
56
- "Invalid video format.")
57
- assert video.avg_fps > 0, "Invalid video format."
58
- return VideoSpec(media_type='video',
59
- height=video.original_height,
60
- width=video.original_width,
61
- num_frames=video.num_frames,
62
- fps=video.avg_fps,
63
- key_indices=video.key_indices,
64
- frame_time_info=video.frame_time_info)
65
-
66
-
67
- def timestamp_as_str(timestamp: float,
68
- timestamp_mode: str = "hh:mm:ss.fff") -> str:
69
- """Convert a timestamp to a string in the format of HH:MM:SS.mmm."""
70
- if timestamp_mode == "hh:mm:ss.fff":
71
- return (datetime.fromtimestamp(timestamp,
72
- tz=timezone.utc).strftime("%H:%M:%S") +
73
- f".{int((timestamp % 1) * 1000):03d}")
74
- elif timestamp_mode == "mm:ss.fff":
75
- return (datetime.fromtimestamp(timestamp,
76
- tz=timezone.utc).strftime("%M:%S") +
77
- f".{int((timestamp % 1) * 1000):03d}")
78
- elif timestamp_mode == "mm:ss":
79
- return datetime.fromtimestamp(timestamp,
80
- tz=timezone.utc).strftime("%M:%S")
81
- else:
82
- raise ValueError(f"Invalid timestamp mode: {timestamp_mode}")
83
-
84
-
85
- def navit_resize_image(
86
- width: int,
87
- height: int,
88
- patch_size: int,
89
- merge_kernel_size: int,
90
- in_patch_limit: int,
91
- patch_limit_on_one_side: int,
92
- fixed_output_tokens: int | None,
93
- ):
94
- # Apply the patch limits.
95
- s1 = math.sqrt(
96
- in_patch_limit /
97
- (max(1.0, width // patch_size) * max(1.0, height // patch_size)))
98
- s2 = patch_limit_on_one_side * patch_size / width
99
- s3 = patch_limit_on_one_side * patch_size / height
100
- scale = min(1.0, s1, s2, s3)
101
- new_w, new_h = max(1, int(width * scale)), max(1, int(height * scale))
102
- new_w = min(new_w, patch_limit_on_one_side * patch_size)
103
- new_h = min(new_h, patch_limit_on_one_side * patch_size)
104
-
105
- # Calculate the padding to make the height and width divisible by the merge kernel size and patch size.
106
- factor = merge_kernel_size * patch_size
107
-
108
- pad_height = (factor - new_h % factor) % factor
109
- pad_width = (factor - new_w % factor) % factor
110
-
111
- if fixed_output_tokens is not None:
112
- num_tokens = fixed_output_tokens
113
- else:
114
- # Calculate new dimensions after padding and patching
115
- token_height = (new_h + pad_height) // factor
116
- token_width = (new_w + pad_width) // factor
117
-
118
- assert token_height * merge_kernel_size <= patch_limit_on_one_side, (
119
- f"token_height {token_height} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
120
- )
121
- assert token_width * merge_kernel_size <= patch_limit_on_one_side, (
122
- f"token_width {token_width} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
123
- )
124
-
125
- num_tokens = token_height * token_width
126
- return {
127
- "num_tokens": num_tokens,
128
- "new_width": new_w,
129
- "new_height": new_h,
130
- "pad_width": pad_width,
131
- "pad_height": pad_height,
132
- "sampled_nframes": 1,
133
- }
134
-
135
-
136
- def navit_resize_video(
137
- width: int,
138
- height: int,
139
- nframes: int,
140
- avg_fps: float,
141
- sample_fps: float,
142
- patch_size: int,
143
- merge_kernel_size: int,
144
- in_patch_limit_each_frame: int,
145
- patch_limit_on_one_side: int,
146
- in_patch_limit_total: int | None,
147
- max_num_frames_each_video: int | None,
148
- fixed_output_tokens_each_frame: int | None,
149
- ):
150
- sample_fps = min(sample_fps, avg_fps)
151
- # Calculate the number of frames to sample based on target FPS
152
- sampled_nframes = max(round(nframes * sample_fps / avg_fps), 1)
153
- if max_num_frames_each_video is not None:
154
- sampled_nframes = min(sampled_nframes, max_num_frames_each_video)
155
-
156
- if in_patch_limit_total is not None:
157
- in_patch_limit_each_frame = min(
158
- round(in_patch_limit_total / sampled_nframes),
159
- in_patch_limit_each_frame)
160
-
161
- ret = navit_resize_image(
162
- width,
163
- height,
164
- patch_size,
165
- merge_kernel_size,
166
- in_patch_limit_each_frame,
167
- patch_limit_on_one_side,
168
- fixed_output_tokens_each_frame,
169
- )
170
- ret["sampled_nframes"] = sampled_nframes
171
- return ret
172
-
173
-
174
- def real_sample_fps_and_max_num_frames(
175
- type_name: Literal["video", "video_chunk"],
176
- sample_fps: float,
177
- max_num_frames_each_video: int | None,
178
- ) -> tuple[int, int | None]:
179
- if type_name == "video":
180
- return sample_fps, max_num_frames_each_video
181
- elif type_name == "video_chunk":
182
- max_num_frames_each_video = None
183
- sample_fps = math.inf
184
- return sample_fps, max_num_frames_each_video
185
- else:
186
- return math.inf, None
187
-
188
-
189
- def _to_pil(data: str | bytes):
190
- if isinstance(data, Image.Image):
191
-
192
- return data.convert("RGB")
193
- elif isinstance(data, str):
194
- if data.startswith("data:"):
195
- raw_base64 = data.split(",")[1]
196
- return Image.open(io.BytesIO(
197
- base64.b64decode(raw_base64))).convert("RGB")
198
- else:
199
- return Image.open(data).convert("RGB")
200
- elif isinstance(data, bytes):
201
- return Image.open(io.BytesIO(data)).convert("RGB")
202
- else:
203
- raise ValueError(f"Unsupported data type: {type(data)}")
204
-
205
-
206
- def ensure_media_type(media: MediaInput) -> MediaInput:
207
- if media['type'] == 'image':
208
- media['image'] = _to_pil(media['image'])
209
- return media
210
- elif media['type'] == 'video_chunk':
211
- media['video_chunk'] = [
212
- _to_pil(frame) for frame in media['video_chunk']
213
- ]
214
- return media
215
- else:
216
- raise ValueError(f"Unsupported media type: {media['type']}")
217
-
218
-
219
- def image_to_np(
220
- image: Image.Image,
221
- resize_to: tuple[int, int] | None = None,
222
- mode: str = "resize",
223
- raise_error_for_ill_resize: bool = True,
224
- ) -> np.ndarray:
225
- """Convert an image to a numpy array.
226
-
227
- Args:
228
- content: The image to convert.
229
- resize_to: The size to resize the image to.
230
- mode: The mode to resize the image to.
231
- raise_error_for_ill_resize: Whether to raise an error for ill-sized resize.
232
-
233
- Returns:
234
- A numpy array.
235
- """
236
- assert isinstance(image, Image.Image), "image must be a PIL Image"
237
- if resize_to is not None:
238
- if mode == "resize":
239
- image = image.resize(resize_to, resample=Image.Resampling.BICUBIC)
240
-
241
- elif mode == "rescale_and_pad_to_center":
242
- scale = min(resize_to[0] / image.width,
243
- resize_to[1] / image.height, 1.0)
244
- new_width = round(image.width * scale)
245
- new_height = round(image.height * scale)
246
- if new_width == 0 or new_height == 0:
247
- if raise_error_for_ill_resize:
248
- raise ValueError(
249
- f"Invalid resize to: {resize_to}, from image size: {image.size}"
250
- )
251
- else:
252
- return np.zeros((resize_to[1], resize_to[0], 3),
253
- dtype=np.uint8)
254
-
255
- image = image.resize((new_width, new_height),
256
- resample=Image.Resampling.BICUBIC)
257
- padding_left = (resize_to[0] - new_width) // 2
258
- padding_right = resize_to[0] - new_width - padding_left
259
- padding_top = (resize_to[1] - new_height) // 2
260
- padding_bottom = resize_to[1] - new_height - padding_top
261
- image = np.asarray(image)
262
- image = np.pad(
263
- image,
264
- ((padding_top, padding_bottom), (padding_left, padding_right),
265
- (0, 0)),
266
- mode="constant",
267
- constant_values=0,
268
- )
269
- assert image.shape == (resize_to[1], resize_to[0], 3)
270
-
271
- elif mode == "rescale_and_pad_to_rightbottom":
272
- scale = min(resize_to[0] / image.width,
273
- resize_to[1] / image.height, 1.0)
274
- new_width = round(image.width * scale)
275
- new_height = round(image.height * scale)
276
- if new_width == 0 or new_height == 0:
277
- if raise_error_for_ill_resize:
278
- raise ValueError(
279
- f"Invalid resize to: {resize_to}, from image size: {image.size}"
280
- )
281
- else:
282
- return np.zeros((resize_to[1], resize_to[0], 3),
283
- dtype=np.uint8)
284
-
285
- image = image.resize((new_width, new_height),
286
- resample=Image.Resampling.BICUBIC)
287
- padding_right = resize_to[0] - new_width
288
- padding_bottom = resize_to[1] - new_height
289
- image = np.asarray(image)
290
- image = np.pad(
291
- image,
292
- ((0, padding_bottom), (0, padding_right), (0, 0)),
293
- mode="constant",
294
- constant_values=0,
295
- )
296
- assert image.shape == (resize_to[1], resize_to[0], 3)
297
-
298
- else:
299
- raise ValueError(f"Invalid mode: {mode}")
300
-
301
- if isinstance(image, Image.Image):
302
- return np.asarray(image)
303
- else:
304
- return image
305
-
306
-
307
- def navit_patchify(pixel_values: np.ndarray,
308
- patch_size: int) -> dict[str, np.ndarray]:
309
- """Reshape the pixel values to a navit shape.
310
-
311
- Args:
312
- pixel_values: np.ndarray, shape (t, h, w, c)
313
- patch_size: int
314
-
315
- Returns:
316
- dict[str, np.ndarray]
317
- - patches: np.ndarray, shape (t * h//patch_size * w//patch_size, c, patch_size, patch_size)
318
- - grid_thw: np.ndarray, (t, h//patch_size, w//patch_size)
319
- """
320
- T, H, W, C = pixel_values.shape
321
- assert C == 3, "pixel_values must have 3 channels"
322
-
323
- patches = pixel_values.reshape(T, H // patch_size, patch_size,
324
- W // patch_size, patch_size, C)
325
- # (T, H//patch_size, W//patch_size, C, patch_size, patch_size)
326
- patches = patches.transpose(0, 1, 3, 5, 2, 4)
327
- patches = patches.reshape(-1, C, patch_size, patch_size)
328
- grid_thw = np.array([T, H // patch_size, W // patch_size])
329
- return {"pixel_values": patches, "grid_thw": grid_thw}
330
-
331
-
332
- def normalize(x: np.ndarray,
333
- mean,
334
- std_inv,
335
- pixels_dtype: np.dtype = np.float32) -> np.ndarray:
336
- """Normalize the image.
337
-
338
- Args:
339
- x: The image to normalize. The shape is (..., 3). The dtype is uint8. The range is [0, 255].
340
- mean: The mean of the image.
341
- std_inv: The inverse of the std of the image.
342
- pixels_dtype: The dtype of the image.
343
- Returns:
344
- The normalized image. The shape is (..., 3). The dtype is determined by the pixels_dtype.
345
- """
346
- x = (x / 255.0).astype(pixels_dtype)
347
- x -= mean
348
- x *= std_inv
349
- return x
350
-
351
-
352
- def _to_tensor(data, **kwargs):
353
- import torch
354
-
355
- if isinstance(data, np.ndarray):
356
- return torch.from_numpy(data).to(**kwargs)
357
- elif isinstance(data, torch.Tensor):
358
- return data.to(**kwargs)
359
- elif isinstance(data, list):
360
- return [_to_tensor(item, **kwargs) for item in data]
361
- elif isinstance(data, tuple):
362
- return tuple(_to_tensor(item, **kwargs) for item in data)
363
- elif isinstance(data, dict):
364
- return {k: _to_tensor(v, **kwargs) for k, v in data.items()}
365
- elif data is None:
366
- return None
367
- else:
368
- raise ValueError(f"Unsupported data type: {type(data)}")
 
1
+ import base64
2
+ import io
3
+ import math
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from typing import List, Literal, NotRequired, Optional, TypedDict
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from pydantic import BaseModel, Field
11
+
12
+ try:
13
+ from mecord import VideoReader
14
+ except ImportError:
15
+ VideoReader = None
16
+
17
+
18
class VideoSpec(BaseModel):
    """Static metadata describing a single video stream."""

    media_type: Literal["video"] = "video"
    height: int = Field(..., gt=0, description="video frame height")
    width: int = Field(..., gt=0, description="video frame width")
    num_frames: int = Field(..., gt=0, description="num frames")
    fps: float = Field(..., gt=0, description="average fps")

    # Optional decode hints that can accelerate video reading.
    # Annotated as Optional: the previous annotations (bare `list[int]` /
    # `dict`) did not admit the `None` default these fields actually carry.
    key_indices: Optional[list[int]] = Field(None, description="key indices")
    frame_time_info: Optional[dict] = Field(None, description="frame time info")
28
+
29
+
30
class ImageInput(TypedDict):
    """A single still-image media item; ``type`` discriminates the MediaInput union."""

    type: Literal["image"]
    # Expected to be a loaded PIL image; ensure_media_type coerces
    # str/bytes sources into PIL RGB beforehand.
    image: Image.Image
33
+
34
+
35
class VideoChunkInput(TypedDict):
    """A chunk of pre-decoded video frames with an optional per-chunk prompt."""

    type: Literal["video_chunk"]
    # Frames in display order; ensure_media_type coerces each entry to PIL RGB.
    video_chunk: List[Image.Image]
    prompt: NotRequired[str]


# Union of all media payloads, discriminated by the ``type`` key.
MediaInput = ImageInput | VideoChunkInput
42
+
43
+
44
def get_video_meta(
    video_src: bytes | str | os.PathLike, accurate: bool = True
) -> VideoSpec:
    """Probe a video source and return its metadata.

    Args:
        video_src: Filesystem path, raw encoded bytes, or a
            ``data:video/mp4;base64,`` URI.
        accurate: Fully initialize the reader (``auto_init``) so frame counts
            and timing info are exact.

    Returns:
        A VideoSpec with dimensions, frame count, average fps, and optional
        decode-acceleration hints. (The previous ``-> dict`` annotation was
        wrong: a VideoSpec model is returned, not a plain dict.)

    Raises:
        ImportError: if the optional ``mecord`` dependency is not installed.
        AssertionError: if the source does not decode to a valid video.
    """
    # Fail with a clear message instead of "NoneType is not callable" when
    # the optional dependency is missing.
    if VideoReader is None:
        raise ImportError("mecord is required for get_video_meta")
    if isinstance(video_src, os.PathLike):
        video_src = str(video_src)
    # A base64 data URI is decoded to raw bytes before handing to the reader.
    if isinstance(video_src, str) and video_src.startswith("data:video/mp4;base64,"):
        video_src = base64.b64decode(video_src.split(",")[1])
    video = VideoReader(video_src, auto_init=accurate, num_threads=1)
    assert video.num_frames > 0, "Invalid video format."
    assert video.original_width > 0 and video.original_height > 0, (
        "Invalid video format."
    )
    assert video.avg_fps > 0, "Invalid video format."
    return VideoSpec(
        media_type="video",
        height=video.original_height,
        width=video.original_width,
        num_frames=video.num_frames,
        fps=video.avg_fps,
        key_indices=video.key_indices,
        frame_time_info=video.frame_time_info,
    )
66
+
67
+
68
def timestamp_as_str(timestamp: float, timestamp_mode: str = "hh:mm:ss.fff") -> str:
    """Render a timestamp (seconds) as a wall-clock string.

    Supported modes: ``"hh:mm:ss.fff"``, ``"mm:ss.fff"``, ``"mm:ss"``.

    Raises:
        ValueError: for any other mode.
    """
    clock_formats = {
        "hh:mm:ss.fff": "%H:%M:%S",
        "mm:ss.fff": "%M:%S",
        "mm:ss": "%M:%S",
    }
    if timestamp_mode not in clock_formats:
        raise ValueError(f"Invalid timestamp mode: {timestamp_mode}")
    rendered = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
        clock_formats[timestamp_mode]
    )
    # strftime has no portable sub-second directive, so milliseconds are
    # appended by hand for the ".fff" modes.
    if timestamp_mode.endswith(".fff"):
        rendered += f".{int((timestamp % 1) * 1000):03d}"
    return rendered
84
+
85
+
86
+ def navit_resize_image(
87
+ width: int,
88
+ height: int,
89
+ patch_size: int,
90
+ merge_kernel_size: int,
91
+ in_patch_limit: int,
92
+ patch_limit_on_one_side: int,
93
+ fixed_output_tokens: int | None,
94
+ ):
95
+ # Apply the patch limits.
96
+ s1 = math.sqrt(
97
+ in_patch_limit
98
+ / (max(1.0, width // patch_size) * max(1.0, height // patch_size))
99
+ )
100
+ s2 = patch_limit_on_one_side * patch_size / width
101
+ s3 = patch_limit_on_one_side * patch_size / height
102
+ scale = min(1.0, s1, s2, s3)
103
+ new_w, new_h = max(1, int(width * scale)), max(1, int(height * scale))
104
+ new_w = min(new_w, patch_limit_on_one_side * patch_size)
105
+ new_h = min(new_h, patch_limit_on_one_side * patch_size)
106
+
107
+ # Calculate the padding to make the height and width divisible by the merge kernel size and patch size.
108
+ factor = merge_kernel_size * patch_size
109
+
110
+ pad_height = (factor - new_h % factor) % factor
111
+ pad_width = (factor - new_w % factor) % factor
112
+
113
+ if fixed_output_tokens is not None:
114
+ num_tokens = fixed_output_tokens
115
+ else:
116
+ # Calculate new dimensions after padding and patching
117
+ token_height = (new_h + pad_height) // factor
118
+ token_width = (new_w + pad_width) // factor
119
+
120
+ assert token_height * merge_kernel_size <= patch_limit_on_one_side, (
121
+ f"token_height {token_height} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
122
+ )
123
+ assert token_width * merge_kernel_size <= patch_limit_on_one_side, (
124
+ f"token_width {token_width} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
125
+ )
126
+
127
+ num_tokens = token_height * token_width
128
+ return {
129
+ "num_tokens": num_tokens,
130
+ "new_width": new_w,
131
+ "new_height": new_h,
132
+ "pad_width": pad_width,
133
+ "pad_height": pad_height,
134
+ "sampled_nframes": 1,
135
+ }
136
+
137
+
138
def navit_resize_video(
    width: int,
    height: int,
    nframes: int,
    avg_fps: float,
    sample_fps: float,
    patch_size: int,
    merge_kernel_size: int,
    in_patch_limit_each_frame: int,
    patch_limit_on_one_side: int,
    in_patch_limit_total: int | None,
    max_num_frames_each_video: int | None,
    fixed_output_tokens_each_frame: int | None,
):
    """Plan frame sampling and per-frame resizing for a video.

    Chooses how many frames to sample (bounded by the target fps and an
    optional hard frame cap), spreads any total patch budget evenly across
    those frames, and delegates the per-frame resize plan to
    ``navit_resize_image``. The returned dict carries the actual
    ``sampled_nframes``.
    """
    # Never sample faster than the source actually plays.
    effective_fps = min(sample_fps, avg_fps)
    sampled_nframes = max(round(nframes * effective_fps / avg_fps), 1)
    if max_num_frames_each_video is not None:
        sampled_nframes = min(sampled_nframes, max_num_frames_each_video)

    # A total budget, when given, tightens the per-frame patch limit.
    per_frame_limit = in_patch_limit_each_frame
    if in_patch_limit_total is not None:
        per_frame_limit = min(
            round(in_patch_limit_total / sampled_nframes), per_frame_limit
        )

    plan = navit_resize_image(
        width,
        height,
        patch_size,
        merge_kernel_size,
        per_frame_limit,
        patch_limit_on_one_side,
        fixed_output_tokens_each_frame,
    )
    plan["sampled_nframes"] = sampled_nframes
    return plan
174
+
175
+
176
+ def real_sample_fps_and_max_num_frames(
177
+ type_name: Literal["video", "video_chunk"],
178
+ sample_fps: float,
179
+ max_num_frames_each_video: int | None,
180
+ ) -> tuple[int, int | None]:
181
+ if type_name == "video":
182
+ return sample_fps, max_num_frames_each_video
183
+ elif type_name == "video_chunk":
184
+ max_num_frames_each_video = None
185
+ sample_fps = math.inf
186
+ return sample_fps, max_num_frames_each_video
187
+ else:
188
+ return math.inf, None
189
+
190
+
191
def _to_pil(data: Image.Image | str | bytes) -> Image.Image:
    """Coerce an image source to an RGB PIL image.

    Accepts an already-loaded PIL image, a filesystem path, a ``data:`` URI
    with a base64 payload, or raw encoded bytes. (The previous ``str | bytes``
    annotation omitted the ``Image.Image`` case the first branch handles.)

    Raises:
        ValueError: for any other input type.
    """
    if isinstance(data, Image.Image):
        return data.convert("RGB")
    elif isinstance(data, str):
        if data.startswith("data:"):
            # data URI: everything after the first comma is the base64 payload.
            raw_base64 = data.split(",")[1]
            return Image.open(io.BytesIO(base64.b64decode(raw_base64))).convert("RGB")
        else:
            # Plain string is treated as a filesystem path.
            return Image.open(data).convert("RGB")
    elif isinstance(data, bytes):
        return Image.open(io.BytesIO(data)).convert("RGB")
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")
204
+
205
+
206
def ensure_media_type(media: MediaInput) -> MediaInput:
    """Coerce the payload of *media* to PIL RGB image(s), in place.

    Raises:
        ValueError: if ``media["type"]`` is not a supported media kind.
    """
    kind = media["type"]
    if kind == "image":
        media["image"] = _to_pil(media["image"])
    elif kind == "video_chunk":
        media["video_chunk"] = [_to_pil(frame) for frame in media["video_chunk"]]
    else:
        raise ValueError(f"Unsupported media type: {media['type']}")
    return media
215
+
216
+
217
def _rescale_and_pad(
    image: Image.Image,
    resize_to: tuple[int, int],
    anchor: str,
    raise_error_for_ill_resize: bool,
) -> np.ndarray:
    """Downscale to fit within resize_to (never upscale) and zero-pad.

    ``anchor == "center"`` pads symmetrically; anything else anchors the image
    at the top-left (padding only on the right/bottom).
    """
    scale = min(resize_to[0] / image.width, resize_to[1] / image.height, 1.0)
    new_width = round(image.width * scale)
    new_height = round(image.height * scale)
    if new_width == 0 or new_height == 0:
        if raise_error_for_ill_resize:
            raise ValueError(
                f"Invalid resize to: {resize_to}, from image size: {image.size}"
            )
        # Degenerate rescale: return an all-black canvas instead of failing.
        return np.zeros((resize_to[1], resize_to[0], 3), dtype=np.uint8)

    image = image.resize((new_width, new_height), resample=Image.Resampling.BICUBIC)
    if anchor == "center":
        padding_left = (resize_to[0] - new_width) // 2
        padding_top = (resize_to[1] - new_height) // 2
    else:
        padding_left = 0
        padding_top = 0
    padding_right = resize_to[0] - new_width - padding_left
    padding_bottom = resize_to[1] - new_height - padding_top
    padded = np.pad(
        np.asarray(image),
        ((padding_top, padding_bottom), (padding_left, padding_right), (0, 0)),
        mode="constant",
        constant_values=0,
    )
    assert padded.shape == (resize_to[1], resize_to[0], 3)
    return padded


def image_to_np(
    image: Image.Image,
    resize_to: tuple[int, int] | None = None,
    mode: str = "resize",
    raise_error_for_ill_resize: bool = True,
) -> np.ndarray:
    """Convert a PIL image to a numpy array, optionally resizing.

    Args:
        image: The image to convert.
        resize_to: Target (width, height); None keeps the original size.
        mode: "resize" (plain bicubic resize), "rescale_and_pad_to_center" or
            "rescale_and_pad_to_rightbottom" (aspect-preserving downscale plus
            zero padding).
        raise_error_for_ill_resize: Whether a degenerate rescale raises (True)
            or yields a black canvas (False).

    Returns:
        A numpy array — (height, width, 3) uint8 for RGB inputs.

    Raises:
        ValueError: for an unknown mode (when resize_to is given), or a
            degenerate rescale when raise_error_for_ill_resize is True.
    """
    assert isinstance(image, Image.Image), "image must be a PIL Image"
    if resize_to is None:
        # No resize requested: mode is ignored, matching the original behavior.
        return np.asarray(image)
    if mode == "resize":
        image = image.resize(resize_to, resample=Image.Resampling.BICUBIC)
        return np.asarray(image)
    # The two pad modes share all logic except the anchor position; the
    # duplicated branches were folded into _rescale_and_pad.
    if mode == "rescale_and_pad_to_center":
        return _rescale_and_pad(image, resize_to, "center", raise_error_for_ill_resize)
    if mode == "rescale_and_pad_to_rightbottom":
        return _rescale_and_pad(
            image, resize_to, "rightbottom", raise_error_for_ill_resize
        )
    raise ValueError(f"Invalid mode: {mode}")
300
+
301
+
302
def navit_patchify(pixel_values: np.ndarray, patch_size: int) -> dict[str, np.ndarray]:
    """Split frames into non-overlapping patches (navit layout).

    Args:
        pixel_values: Array of shape (t, h, w, c) with c == 3; h and w must be
            divisible by patch_size.
        patch_size: Side length of each square patch.

    Returns:
        Dict with:
        - "pixel_values": (t * h//patch_size * w//patch_size, c, patch_size, patch_size)
        - "grid_thw": array [t, h//patch_size, w//patch_size]
    """
    num_frames, height, width, channels = pixel_values.shape
    assert channels == 3, "pixel_values must have 3 channels"

    grid_h = height // patch_size
    grid_w = width // patch_size
    # (t, grid_h, p, grid_w, p, c) -> (t, grid_h, grid_w, c, p, p)
    tiled = pixel_values.reshape(
        num_frames, grid_h, patch_size, grid_w, patch_size, channels
    ).transpose(0, 1, 3, 5, 2, 4)
    return {
        "pixel_values": tiled.reshape(-1, channels, patch_size, patch_size),
        "grid_thw": np.array([num_frames, grid_h, grid_w]),
    }
325
+
326
+
327
def normalize(
    x: np.ndarray, mean, std_inv, pixels_dtype: np.dtype = np.float32
) -> np.ndarray:
    """Scale uint8 pixels to [0, 1], then apply mean/std normalization.

    Args:
        x: Image array of shape (..., 3), dtype uint8, values in [0, 255].
        mean: Per-channel (or scalar) mean to subtract.
        std_inv: Per-channel (or scalar) reciprocal std to multiply by.
        pixels_dtype: Output dtype.

    Returns:
        Normalized array of shape (..., 3) with dtype ``pixels_dtype``.
    """
    # Dividing by 255.0 produces a fresh array, so the in-place updates below
    # never touch the caller's buffer (and keep the requested dtype even when
    # mean/std_inv are float64).
    normalized = (x / 255.0).astype(pixels_dtype)
    normalized -= mean
    normalized *= std_inv
    return normalized
344
+
345
+
346
+ def _to_tensor(data, **kwargs):
347
+ import torch
348
+
349
+ if isinstance(data, np.ndarray):
350
+ return torch.from_numpy(data).to(**kwargs)
351
+ elif isinstance(data, torch.Tensor):
352
+ return data.to(**kwargs)
353
+ elif isinstance(data, list):
354
+ return [_to_tensor(item, **kwargs) for item in data]
355
+ elif isinstance(data, tuple):
356
+ return tuple(_to_tensor(item, **kwargs) for item in data)
357
+ elif isinstance(data, dict):
358
+ return {k: _to_tensor(v, **kwargs) for k, v in data.items()}
359
+ elif data is None:
360
+ return None
361
+ else:
362
+ raise ValueError(f"Unsupported data type: {type(data)}")