# Page-header residue: author jena-shreyas,
# commit 80ceab0 ("Initial commit without videos").
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from typing import Optional, Dict, Any, Union, List
from .base import BaseVideoModel
# Three-component mean / standard-deviation tuples. Presumably per-channel
# (RGB) ImageNet normalization statistics for frame preprocessing; they are
# not referenced by the code visible in this file -- TODO confirm usage.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class InternVLModel(BaseVideoModel):
    """Video model wrapper that loads an InternVL checkpoint via transformers.

    The constructor loads the model and its tokenizer. ``chat`` and
    ``chat_with_confidence`` are currently placeholders: they accept their
    arguments and return ``None`` without running inference.
    """

    # Immutable source of the default answer tokens. A fresh list is built
    # from it on every call so that no list object is shared between calls.
    _DEFAULT_TOKEN_CHOICES = ("Yes", "No")

    def __init__(
        self,
        model_name: str = "OpenGVLab/InternVL3_5-8B",
        *,
        trust_remote_code: bool = True,
    ):
        """Load the model and tokenizer identified by ``model_name``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                model and the tokenizer, and forwarded to the base class.
            trust_remote_code: Forwarded to both ``from_pretrained`` calls.
                The default InternVL checkpoint is published with custom
                modelling code, which transformers only loads when this flag
                is set (per the model card -- verify for other checkpoints).
                SECURITY: ``True`` executes Python code shipped in the model
                repository; pass ``False`` for checkpoints you do not trust.
        """
        super().__init__(model_name)
        self.model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Placeholder for answering ``prompt`` about the video at ``video_path``.

        Not implemented yet: no argument is used and the method returns
        ``None`` despite the ``str`` annotation. The parameter meanings below
        are inferred from their names only -- TODO confirm when implementing.

        Args:
            prompt: Text prompt (presumably the user question).
            video_path: Path to the video (presumably a local file).
            fps: Presumably the frame sampling rate.
            max_new_tokens: Presumably the generation length limit.
            temperature: Presumably the sampling temperature.
        """
        # TODO: implement inference; kept as a no-op to preserve behaviour.
        return None

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Placeholder for a ``chat`` variant that also reports confidence.

        Not implemented yet: the method returns ``None`` despite the ``Dict``
        annotation. Parameter meanings other than the ``token_choices``
        default are inferred from names only -- TODO confirm when implementing.

        Args:
            prompt: Text prompt (presumably the user question).
            video_path: Path to the video (presumably a local file).
            fps: Presumably the frame sampling rate.
            max_new_tokens: Presumably the generation length limit.
            temperature: Presumably the sampling temperature.
            token_choices: Candidate answer tokens. ``None`` (the default)
                selects a fresh ``["Yes", "No"]`` list.
            logits_temperature: Presumably a temperature applied to logits
                when computing confidence.
            return_confidence: Presumably toggles confidence in the result.
            debug: Presumably enables extra diagnostic output.
        """
        if token_choices is None:
            # A list literal in the signature would be one object shared by
            # every call; build a new list per call instead.
            token_choices = list(self._DEFAULT_TOKEN_CHOICES)
        # TODO: implement inference; kept as a no-op to preserve behaviour.
        return None