Spaces:
Sleeping
Sleeping
| import math | |
| import numpy as np | |
| import torch | |
| import torchvision.transforms as T | |
| from decord import VideoReader, cpu | |
| from PIL import Image | |
| from torchvision.transforms.functional import InterpolationMode | |
| from transformers import AutoModel, AutoTokenizer | |
| from typing import Optional, Dict, Any, Union, List | |
| from .base import BaseVideoModel | |
# Normalization statistics named for ImageNet: three values each, presumably
# one per colour channel (confirm the channel order against the transform that
# consumes them). Neither constant is referenced in the visible part of this
# file.
IMAGENET_MEAN = (
    0.485,
    0.456,
    0.406,
)
IMAGENET_STD = (
    0.229,
    0.224,
    0.225,
)
class InternVLModel(BaseVideoModel):
    """Video chat wrapper around an InternVL checkpoint loaded via transformers.

    Construction loads the model and its tokenizer with the ``Auto*``
    factories. Both chat entry points are still placeholders: their bodies
    perform no inference and they return ``None``.
    """

    def __init__(self, model_name: str = "OpenGVLab/InternVL3_5-8B"):
        """Load the model and tokenizer identified by ``model_name``.

        Args:
            model_name: Checkpoint identifier forwarded to the base class
                initializer and to both ``from_pretrained`` calls.
        """
        super().__init__(model_name)
        # NOTE(review): checkpoints that ship custom modeling code require
        # ``trust_remote_code=True`` in these calls -- confirm whether this
        # checkpoint does before relying on the defaults.
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Placeholder for answering ``prompt`` about the video at ``video_path``.

        Not implemented yet: none of the arguments are used and the method
        currently returns ``None`` rather than the annotated ``str``.

        Args:
            prompt: Text prompt.
            video_path: Location of the video.
            fps: Presumably the frame sampling rate -- unused so far.
            max_new_tokens: Presumably the generation length cap -- unused so far.
            temperature: Presumably the sampling temperature -- unused so far.
        """
        pass

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Placeholder for a chat call that also reports answer confidence.

        Not implemented yet: apart from resolving the ``token_choices``
        default, nothing is computed and the method currently returns
        ``None`` rather than the annotated ``Dict[str, Any]``.

        Args:
            prompt: Text prompt.
            video_path: Location of the video.
            fps: Presumably the frame sampling rate -- unused so far.
            max_new_tokens: Presumably the generation length cap -- unused so far.
            temperature: Presumably the sampling temperature -- unused so far.
            token_choices: Candidate answer tokens. ``None`` (the default)
                resolves to a fresh ``["Yes", "No"]`` list on every call.
            logits_temperature: Unused so far; semantics to be defined by the
                implementation.
            return_confidence: Unused so far.
            debug: Unused so far.
        """
        # A list literal in the signature would be one object shared by every
        # call; build a new list per call so later in-place edits cannot leak
        # between invocations.
        if token_choices is None:
            token_choices = ["Yes", "No"]