import math
from typing import Optional, Dict, Any, Union, List

import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

from .base import BaseVideoModel

# Per-channel normalization statistics; not referenced by the code visible in
# this module yet (presumably reserved for frame preprocessing -- confirm).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class InternVLModel(BaseVideoModel):
    """Video model wrapper that loads a model and tokenizer by name.

    The model and tokenizer are loaded eagerly in ``__init__`` through the
    ``transformers`` ``AutoModel`` / ``AutoTokenizer`` factories. Both chat
    entry points are currently unimplemented stubs.
    """

    def __init__(self, model_name: str = "OpenGVLab/InternVL3_5-8B"):
        """Load the model and tokenizer identified by ``model_name``.

        Args:
            model_name: Identifier passed to the base-class constructor and to
                both ``from_pretrained`` calls.
        """
        super().__init__(model_name)
        # NOTE(review): some checkpoints ship custom modeling code and may need
        # ``trust_remote_code=True`` here -- confirm for the default model.
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Answer ``prompt`` about the video at ``video_path`` (not implemented).

        Args:
            prompt: Text prompt for the model.
            video_path: Path to the input video.
            fps: Presumably the frame sampling rate -- confirm once implemented.
            max_new_tokens: Presumably the generation length cap -- confirm.
            temperature: Presumably the sampling temperature -- confirm.

        Returns:
            Annotated as ``str``, but the stub currently returns ``None``.
        """
        # TODO: implement; the stub intentionally keeps returning None.
        return None

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Chat variant that is meant to report confidence (not implemented).

        Args:
            prompt: Text prompt for the model.
            video_path: Path to the input video.
            fps: Presumably the frame sampling rate -- confirm once implemented.
            max_new_tokens: Presumably the generation length cap -- confirm.
            temperature: Presumably the sampling temperature -- confirm.
            token_choices: Candidate answer tokens. ``None`` (the default) is
                resolved to ``["Yes", "No"]``.
            logits_temperature: Presumably a temperature applied to the choice
                logits -- confirm once implemented.
            return_confidence: Presumably toggles confidence output -- confirm.
            debug: Presumably toggles diagnostic output -- confirm.

        Returns:
            Annotated as ``Dict[str, Any]``, but the stub currently returns
            ``None``.
        """
        # Resolve the default inside the body so that calls never share a
        # single mutable list object created at function-definition time.
        if token_choices is None:
            token_choices = ["Yes", "No"]
        # TODO: implement; the stub intentionally keeps returning None.
        return None