jena-shreyas committed on
Commit
80ceab0
·
1 Parent(s): ae099b5

Initial commit without videos

Browse files
Files changed (10) hide show
  1. .gitattributes +4 -0
  2. .gitignore +3 -0
  3. app.py +414 -0
  4. models/__init__.py +121 -0
  5. models/base.py +27 -0
  6. models/internvl.py +44 -0
  7. models/llava_video.py +154 -0
  8. models/qwen2_5.py +288 -0
  9. models/qwen3vl.py +299 -0
  10. requirements.txt +149 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ *.webm filter=lfs diff=lfs merge=lfs -text
38
+ *.avi filter=lfs diff=lfs merge=lfs -text
39
+ *.mov filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .gradio/
2
+ models/__pycache__/
3
+ SETUP_VIDEO_LFS.md
app.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import sys
from pathlib import Path
import gradio as gr

# Allow importing your models package
sys.path.insert(0, str(Path(__file__).parent))

from models import load_model
from models.base import BaseVideoModel

# ----------------------
# CONFIG
# ----------------------
MODEL_PATH = "lmms-lab/LLaVA-Video-7B-Qwen2"
DEVICE_MAP = "cuda:0"

VIDEO_DIR = str(Path(__file__).parent / "videos")

FPS = 1.0
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.01

# ----------------------
# Load model ONCE (kept in memory for the lifetime of the app)
# ----------------------
print("Loading LLaVa-Video-7B-Qwen2...")
model: BaseVideoModel = load_model(
    MODEL_PATH,
    device_map=DEVICE_MAP,
)
print("Model loaded.")

# ----------------------
# Collect video IDs
# ----------------------
# BUGFIX: the repo can be checked out without a videos/ directory (this
# commit ships no videos), in which case os.listdir() raised
# FileNotFoundError at startup. Fall back to an empty list instead.
if os.path.isdir(VIDEO_DIR):
    VIDEO_IDS = sorted(
        os.path.splitext(f)[0]
        for f in os.listdir(VIDEO_DIR)
        if f.endswith(".webm")
    )
else:
    print(f"Warning: video directory not found: {VIDEO_DIR}")
    VIDEO_IDS = []

# ----------------------
# Helpers
# ----------------------
def get_video_path(video_id: str):
    """Return the absolute path of <video_id>.webm, or None if it does not exist."""
    if not video_id:
        return None
    path = os.path.join(VIDEO_DIR, video_id + ".webm")
    return path if os.path.exists(path) else None

# ----------------------
# Inference function
# ----------------------
def video_qa(video_id: str, prompt: str) -> str:
    """Run the loaded model on (video_id, prompt) and return its answer.

    Returns a user-facing error string (prefixed with ❌) instead of raising,
    so the Gradio UI never crashes on bad input or inference failures.
    """
    if not video_id:
        return "❌ Please select a video ID."

    if not prompt.strip():
        return "❌ Please enter a prompt."

    video_path = get_video_path(video_id)
    if video_path is None:
        return f"❌ Video not found: {video_id}.webm"

    try:
        response = model.chat(
            prompt=prompt,
            video_path=video_path,
            fps=FPS,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
        )
        return response

    except Exception as e:
        # Surface the failure in the answer box rather than killing the app.
        return f"❌ Error during inference: {str(e)}"

# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2") as demo:
    gr.Markdown("## 🎥 Video Question Answering (LLaVa-Video-7B-Qwen2)")

    with gr.Row():
        # LEFT COLUMN
        with gr.Column(scale=1):
            video_id = gr.Dropdown(
                choices=VIDEO_IDS,
                label="Video ID",
                filterable=True,
                interactive=True
            )

            video_player = gr.Video(
                label="Selected Video",
                autoplay=True,
                height=240
            )

        # RIGHT COLUMN
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Ask a question about the selected video",
                lines=4
            )
            answer = gr.Textbox(
                label="Model Answer",
                lines=8
            )
            run = gr.Button("Run Inference 🚀")

    # Update video player when dropdown changes
    video_id.change(
        fn=get_video_path,
        inputs=video_id,
        outputs=video_player
    )

    # Run inference
    run.click(
        fn=video_qa,
        inputs=[video_id, prompt],
        outputs=answer
    )


demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True
)
136
+
137
+ # #---------------
138
+ # #---------------
139
+ # #---------------
140
+ # # Feb 5, 2026
141
+ # #---------------
142
+ # import os
143
+ # import sys
144
+ # import json
145
+ # from pathlib import Path
146
+ # import gradio as gr
147
+
148
+ # # Allow importing your models package
149
+ # sys.path.insert(0, str(Path(__file__).parent))
150
+
151
+ # from models import load_model
152
+ # from models.base import BaseVideoModel
153
+
154
+ # # ----------------------
155
+ # # CONFIG
156
+ # # ----------------------
157
+ # QWEN_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
158
+ # LLAVA_MODEL_PATH = "lmms-lab/LLaVA-Video-7B-Qwen2"
159
+ # DEVICE_MAP_QWEN = "cuda:0"
160
+ # DEVICE_MAP_LLAVA = "cuda:0" # Both models on same GPU
161
+
162
+ # VIDEO_DIR = "/home/raman/Gradio_Qwen3vl4bInstruct/videos"
163
+ # LABELS_JSON = "/home/raman/Gradio_Qwen3vl4bInstruct/SSv2_prepost_sampled.json"
164
+
165
+ # DEFAULT_FPS = 1.0
166
+ # MAX_NEW_TOKENS = 512
167
+ # TEMPERATURE = 0.01
168
+
169
+ # # ----------------------
170
+ # # Load video labels
171
+ # # ----------------------
172
+ # print("Loading video labels...")
173
+ # video_labels = {}
174
+ # try:
175
+ # with open(LABELS_JSON, 'r') as f:
176
+ # labels_data = json.load(f)
177
+ # for item in labels_data:
178
+ # video_labels[item['id']] = {
179
+ # 'label': item['label'],
180
+ # 'template': item.get('template', ''),
181
+ # 'action_group': item.get('action_group', '')
182
+ # }
183
+ # print(f"Loaded {len(video_labels)} video labels.")
184
+ # except Exception as e:
185
+ # print(f"Warning: Could not load labels JSON: {e}")
186
+
187
+ # # ----------------------
188
+ # # Load models
189
+ # # ----------------------
190
+ # print("Loading Qwen3-VL-4B-Instruct...")
191
+ # qwen_model: BaseVideoModel = load_model(
192
+ # QWEN_MODEL_PATH,
193
+ # device_map=DEVICE_MAP_QWEN,
194
+ # )
195
+ # print("Qwen model loaded.")
196
+
197
+ # print("Loading LLaVA-Video-7B...")
198
+ # llava_model: BaseVideoModel = load_model(
199
+ # LLAVA_MODEL_PATH,
200
+ # device_map=DEVICE_MAP_LLAVA,
201
+ # )
202
+ # print("LLaVA model loaded.")
203
+
204
+ # # ----------------------
205
+ # # Collect video IDs
206
+ # # ----------------------
207
+ # VIDEO_IDS = sorted([
208
+ # os.path.splitext(f)[0]
209
+ # for f in os.listdir(VIDEO_DIR)
210
+ # if f.endswith(".mp4")
211
+ # ])
212
+
213
+ # print(f"Found {len(VIDEO_IDS)} videos.")
214
+
215
+ # # ----------------------
216
+ # # Helpers
217
+ # # ----------------------
218
+ # def get_video_path(video_id: str):
219
+ # if not video_id:
220
+ # return None
221
+ # path = os.path.join(VIDEO_DIR, video_id + ".mp4")
222
+ # return path if os.path.exists(path) else None
223
+
224
+ # def get_video_label(video_id: str):
225
+ # if not video_id:
226
+ # return ""
227
+ # info = video_labels.get(video_id, {})
228
+ # label = info.get('label', 'No label available')
229
+ # action_group = info.get('action_group', '')
230
+
231
+ # if action_group:
232
+ # return f"**Label:** {label}\n\n**Action Group:** {action_group}"
233
+ # return f"**Label:** {label}"
234
+
235
+ # def update_video_info(video_id: str):
236
+ # """Returns video path and label when video is selected"""
237
+ # video_path = get_video_path(video_id)
238
+ # label = get_video_label(video_id)
239
+ # return video_path, label
240
+
241
+ # # ----------------------
242
+ # # Inference functions
243
+ # # ----------------------
244
+ # def qwen_inference(video_id: str, prompt: str, fps: float) -> str:
245
+ # if not video_id:
246
+ # return "❌ Please select a video ID."
247
+
248
+ # if not prompt.strip():
249
+ # return "❌ Please enter a prompt."
250
+
251
+ # video_path = get_video_path(video_id)
252
+ # if video_path is None:
253
+ # return f"❌ Video not found: {video_id}.mp4"
254
+
255
+ # try:
256
+ # response = qwen_model.chat(
257
+ # prompt=prompt,
258
+ # video_path=video_path,
259
+ # fps=fps,
260
+ # max_new_tokens=MAX_NEW_TOKENS,
261
+ # temperature=TEMPERATURE,
262
+ # )
263
+ # return response
264
+
265
+ # except Exception as e:
266
+ # return f"❌ Error during Qwen inference: {str(e)}"
267
+
268
+ # def llava_inference(video_id: str, prompt: str, fps: float) -> str:
269
+ # if not video_id:
270
+ # return "❌ Please select a video ID."
271
+
272
+ # if not prompt.strip():
273
+ # return "❌ Please enter a prompt."
274
+
275
+ # video_path = get_video_path(video_id)
276
+ # if video_path is None:
277
+ # return f"❌ Video not found: {video_id}.mp4"
278
+
279
+ # try:
280
+ # response = llava_model.chat(
281
+ # prompt=prompt,
282
+ # video_path=video_path,
283
+ # fps=fps,
284
+ # max_new_tokens=MAX_NEW_TOKENS,
285
+ # temperature=TEMPERATURE,
286
+ # )
287
+ # return response
288
+
289
+ # except Exception as e:
290
+ # return f"❌ Error during LLaVA inference: {str(e)}"
291
+
292
+ # # ----------------------
293
+ # # Gradio UI
294
+ # # ----------------------
295
+ # with gr.Blocks(title="Video QA – Qwen3-VL & LLaVA-Video", theme=gr.themes.Soft()) as demo:
296
+ # gr.Markdown("# 🎥 Video Question Answering Demo")
297
+ # gr.Markdown("Compare **Qwen3-VL-4B-Instruct** and **LLaVA-Video-7B-Qwen2** on the same videos")
298
+
299
+ # # TOP SECTION: Video Selection and Display
300
+ # with gr.Row():
301
+ # with gr.Column(scale=1):
302
+ # video_id = gr.Dropdown(
303
+ # choices=VIDEO_IDS,
304
+ # label="📁 Select Video ID",
305
+ # filterable=True,
306
+ # interactive=True,
307
+ # value=VIDEO_IDS[0] if VIDEO_IDS else None
308
+ # )
309
+
310
+ # video_label = gr.Markdown(
311
+ # value=get_video_label(VIDEO_IDS[0]) if VIDEO_IDS else "",
312
+ # label="Video Information"
313
+ # )
314
+
315
+ # fps_slider = gr.Slider(
316
+ # minimum=0.5,
317
+ # maximum=5.0,
318
+ # step=0.5,
319
+ # value=DEFAULT_FPS,
320
+ # label="🎞️ Frames Per Second (FPS)",
321
+ # info="Higher FPS = more frames analyzed (slower but more detailed)"
322
+ # )
323
+
324
+ # with gr.Column(scale=2):
325
+ # video_player = gr.Video(
326
+ # label="Selected Video",
327
+ # autoplay=False,
328
+ # height=360,
329
+ # value=get_video_path(VIDEO_IDS[0]) if VIDEO_IDS else None
330
+ # )
331
+
332
+ # gr.Markdown("---")
333
+
334
+ # # BOTTOM SECTION: Two Models Side by Side
335
+ # with gr.Row():
336
+ # # QWEN COLUMN
337
+ # with gr.Column(scale=1):
338
+ # gr.Markdown("### 🤖 Qwen3-VL-4B-Instruct")
339
+
340
+ # qwen_prompt = gr.Textbox(
341
+ # label="Prompt",
342
+ # placeholder="Ask a question about the video...",
343
+ # lines=4,
344
+ # value="Describe what is happening in this video."
345
+ # )
346
+
347
+ # qwen_answer = gr.Textbox(
348
+ # label="Qwen Answer",
349
+ # lines=10,
350
+ # interactive=False
351
+ # )
352
+
353
+ # qwen_run = gr.Button("🚀 Run Qwen Inference", variant="primary")
354
+
355
+ # # LLAVA COLUMN
356
+ # with gr.Column(scale=1):
357
+ # gr.Markdown("### 🎬 LLaVA-Video-7B-Qwen2")
358
+
359
+ # llava_prompt = gr.Textbox(
360
+ # label="Prompt",
361
+ # placeholder="Ask a question about the video...",
362
+ # lines=4,
363
+ # value="Describe what is happening in this video."
364
+ # )
365
+
366
+ # llava_answer = gr.Textbox(
367
+ # label="LLaVA Answer",
368
+ # lines=10,
369
+ # interactive=False
370
+ # )
371
+
372
+ # llava_run = gr.Button("🚀 Run LLaVA Inference", variant="primary")
373
+
374
+ # # Model info footer
375
+ # gr.Markdown("""
376
+ # ---
377
+ # **Model Information:**
378
+ # - **Qwen3-VL-4B-Instruct**: 4B parameter vision-language model
379
+ # - **LLaVA-Video-7B-Qwen2**: 7B parameter video understanding model
380
+
381
+ # **Settings:** Max Tokens={}, Temperature={}
382
+ # """.format(MAX_NEW_TOKENS, TEMPERATURE))
383
+
384
+ # # ----------------------
385
+ # # Event Handlers
386
+ # # ----------------------
387
+
388
+ # # Update video player and label when dropdown changes
389
+ # video_id.change(
390
+ # fn=update_video_info,
391
+ # inputs=video_id,
392
+ # outputs=[video_player, video_label]
393
+ # )
394
+
395
+ # # Run Qwen inference
396
+ # qwen_run.click(
397
+ # fn=qwen_inference,
398
+ # inputs=[video_id, qwen_prompt, fps_slider],
399
+ # outputs=qwen_answer
400
+ # )
401
+
402
+ # # Run LLaVA inference
403
+ # llava_run.click(
404
+ # fn=llava_inference,
405
+ # inputs=[video_id, llava_prompt, fps_slider],
406
+ # outputs=llava_answer
407
+ # )
408
+
409
+ # # Launch
410
+ # demo.launch(
411
+ # server_name="0.0.0.0",
412
+ # server_port=7860,
413
+ # share=True
414
+ # )
models/__init__.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Optional, Union, Dict

import torch
from packaging import version

from .base import BaseVideoModel

# Version constraints for the two model families: Qwen needs a recent
# transformers, LLaVA-Video needs an old one, so at most one family is
# importable in a given environment.
qwen_required_version = version.parse("4.57.0")
llava_required_version = version.parse("4.40.0")

# Conditionally import whichever model wrappers the installed transformers
# version can actually support.
try:
    import transformers
    # More robust import path for newer transformers
    from transformers.generation import LogitsProcessor

    transformers_version = version.parse(transformers.__version__)

    QWEN_MODELS_AVAILABLE = False
    LLAVA_MODELS_AVAILABLE = False

    # Qwen wrappers require transformers >= 4.57.0.
    if transformers_version >= qwen_required_version:
        from .qwen2_5 import Qwen2_5VLModel
        from .qwen3vl import Qwen3VLModel

        QWEN_MODELS_AVAILABLE = True
    else:
        print(
            f"Warning: Qwen models require transformers>=4.57.0, "
            f"but found {transformers.__version__}. "
            f"Qwen models will not be available."
        )

    # LLaVA wrappers require transformers <= 4.40.0.
    if transformers_version <= llava_required_version:
        from .llava_video import LLaVAVideoModel

        LLAVA_MODELS_AVAILABLE = True
    else:
        print(
            f"Warning: LLaVA models require transformers<=4.40.0, "
            f"but found {transformers.__version__}. "
            f"LLaVA models will not be available."
        )

except ImportError as e:
    print("Warning: Could not import transformers correctly.")
    raise e


# Advertise only the classes that actually imported.
__all__ = []
if QWEN_MODELS_AVAILABLE:
    __all__.extend(["Qwen2_5VLModel", "Qwen3VLModel"])
if LLAVA_MODELS_AVAILABLE:
    __all__.append("LLaVAVideoModel")
# Map a HuggingFace model id onto the matching wrapper class.
def load_model(
    model_path: str,
    dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
    device_map: Optional[Union[str, Dict]] = "auto",
    attn_implementation: Optional[str] = "flash_attention_2",
) -> BaseVideoModel:
    """Instantiate the wrapper class that matches *model_path*.

    Raises ImportError when the installed transformers version cannot run
    the requested family, and ValueError for an unrecognised model id.
    """
    # Constructor kwargs shared by every wrapper class.
    common_kwargs = dict(
        dtype=dtype,
        device_map=device_map,
        attn_implementation=attn_implementation,
    )

    if "LLaVA-Video" in model_path:
        if not LLAVA_MODELS_AVAILABLE:
            raise ImportError(
                "LLaVA models require transformers<=4.40.0. "
                "Please downgrade transformers."
            )
        return LLaVAVideoModel(model_path, **common_kwargs)

    if "Qwen" in model_path:
        if not QWEN_MODELS_AVAILABLE:
            raise ImportError(
                "Qwen models require transformers>=4.57.0. "
                "Please upgrade transformers."
            )
        # Qwen3 ids get the Qwen3 wrapper; everything else falls back to 2.5.
        model_cls = Qwen3VLModel if "Qwen3" in model_path else Qwen2_5VLModel
        return model_cls(model_path, **common_kwargs)

    raise ValueError(f"Unsupported model path: {model_path}")
class LogitsCaptureProcessor(LogitsProcessor):
    """LogitsProcessor that records the fully-processed logits right before
    sampling — one CPU tensor per generated token."""

    def __init__(self):
        # One entry per generation step, stored on CPU.
        self.captured_logits = []

    def __call__(
        self,
        input_ids: torch.LongTensor,
        scores: torch.FloatTensor,
    ) -> torch.FloatTensor:
        # Detach + clone so later in-place edits by other processors cannot
        # mutate what we stored; move to CPU to avoid pinning GPU memory.
        self.captured_logits.append(scores.detach().clone().cpu())
        return scores

    def reset(self):
        """Discard everything captured so far."""
        self.captured_logits = []
models/base.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from abc import ABC, abstractmethod
from typing import Dict, Optional, Union, Any


class BaseVideoModel(ABC):
    """Common interface for the video question-answering model wrappers."""

    def __init__(self, model_name: str):
        # HuggingFace model id; subclasses populate `model` and `processor`.
        self.model_name = model_name
        self.model = None
        self.processor = None

    @abstractmethod
    def chat(
        self,
        prompt: str,
        video_path: str,
        generation_config: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Answer *prompt* about the video at *video_path*."""

    @abstractmethod
    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        generation_config: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Union[str, float]]:
        """Like chat(), but also report a confidence score or token logits."""
models/internvl.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from typing import Optional, Dict, Any, Union, List
from .base import BaseVideoModel

# ImageNet normalisation constants (standard for InternVL preprocessing).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class InternVLModel(BaseVideoModel):
    """Placeholder wrapper for InternVL checkpoints.

    Loads the model/tokenizer, but both chat methods are not implemented yet.
    """

    def __init__(self, model_name: str = "OpenGVLab/InternVL3_5-8B"):
        super().__init__(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        # TODO: implement InternVL video chat.
        pass

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        # BUGFIX: avoid a mutable default argument (was ["Yes", "No"]);
        # the old default is preserved via this None sentinel.
        if token_choices is None:
            token_choices = ["Yes", "No"]
        # TODO: implement InternVL confidence scoring.
        pass
models/llava_video.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Run with `conda activate llava`
from llava.model.builder import load_pretrained_model
from llava.mm_utils import (
    get_model_name_from_path,
    process_images,
    tokenizer_image_token,
)
from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IGNORE_INDEX,
)
from llava.conversation import conv_templates, SeparatorStyle
from PIL import Image
import requests
import copy
import torch
import sys
from typing import Optional, Union, Dict, List, Any
import warnings
from decord import VideoReader, cpu
import numpy as np

# Handle both relative and absolute imports
try:
    from .base import BaseVideoModel
except ImportError:
    from base import BaseVideoModel

warnings.filterwarnings("ignore")


class LLaVAVideoModel(BaseVideoModel):
    """Wrapper around lmms-lab LLaVA-Video checkpoints loaded via the
    `llava` package."""

    def __init__(
        self,
        model_name: str = "lmms-lab/LLaVA-Video-7B-Qwen2",
        dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
        device_map: Optional[Union[str, Dict]] = "auto",
        attn_implementation: Optional[str] = "flash_attention_2",
    ):
        super().__init__(model_name)
        base_model = "llava_qwen"
        self.dtype = dtype
        # LLaVA-Video's loader only accepts torch_dtype as a string, so map
        # the torch dtype to its name. BUGFIX: the original left torch_dtype
        # unbound (NameError) for any dtype other than bfloat16/float16;
        # now strings pass through and anything else fails loudly.
        if dtype == torch.bfloat16:
            torch_dtype = "bfloat16"
        elif dtype == torch.float16:
            torch_dtype = "float16"
        elif isinstance(dtype, str):
            torch_dtype = dtype
        else:
            raise ValueError(f"Unsupported dtype for LLaVA-Video: {dtype}")

        # NOTE(review): `attn_implementation` is accepted for interface
        # parity with the Qwen wrappers but is not forwarded to the loader
        # (the original did the same) — confirm whether it should be passed.
        self.tokenizer, self.model, self.image_processor, max_length = (
            load_pretrained_model(
                model_name,
                None,
                base_model,
                torch_dtype=torch_dtype,
                device_map=device_map,
            )
        )  # Add any other thing you want to pass in llava_model_args
        self.model.eval()

        # Ensure all model components are on the same device.
        # The vision tower and mm_projector may not land on the correct
        # device when `load_pretrained_model` uses device_map, so move them
        # explicitly to the model's device.
        if hasattr(self.model, "get_vision_tower"):
            vision_tower = self.model.get_vision_tower()
            if vision_tower is not None:
                vision_tower.to(self.model.device)

        if hasattr(self.model, "get_model"):
            model_inner = self.model.get_model()
            if hasattr(model_inner, "mm_projector"):
                model_inner.mm_projector.to(self.model.device)

    def load_video(
        self,
        video_path: str,
        fps: float = 1.0,
        max_frames_num: int = -1,
        force_sample: bool = False,
    ):
        """Decode frames from *video_path* at roughly *fps* frames/second.

        Returns a 3-tuple ``(frames, frame_time, video_time)`` where
        ``frames`` is an (N, H, W, 3) uint8 array, ``frame_time`` is a
        comma-joined string of per-frame timestamps and ``video_time`` is
        the clip duration in seconds.
        """
        if max_frames_num == 0:
            # BUGFIX: the original returned a bare ndarray here while every
            # caller unpacks a 3-tuple; return the same tuple shape instead.
            return np.zeros((1, 336, 336, 3)), "0.00s", 0.0
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        total_frame_num = len(vr)
        video_time = total_frame_num / vr.get_avg_fps()
        # Number of source frames to skip between samples to hit target fps.
        frame_interval = round(vr.get_avg_fps() / fps)
        frame_idx = [i for i in range(0, len(vr), frame_interval)]
        # NOTE(review): i / frame_interval is the sample ordinal scaled by
        # the target fps, not seconds (the uniform-sampling branch below
        # divides by avg_fps instead) — confirm which unit is intended.
        frame_time = [i / frame_interval for i in frame_idx]
        if (max_frames_num > 0 and len(frame_idx) > max_frames_num) or force_sample:
            # Fall back to uniformly sampling exactly max_frames_num frames.
            # NOTE(review): force_sample=True with max_frames_num <= 0 would
            # make linspace fail, as in the original — confirm callers never
            # do this.
            sample_fps = max_frames_num
            uniform_sampled_frames = np.linspace(
                0, total_frame_num - 1, sample_fps, dtype=int
            )
            frame_idx = uniform_sampled_frames.tolist()
            frame_time = [i / vr.get_avg_fps() for i in frame_idx]
        frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
        spare_frames = vr.get_batch(frame_idx).asnumpy()
        return spare_frames, frame_time, video_time

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Answer *prompt* about the video at *video_path* and return the text."""
        video, _, _ = self.load_video(video_path, fps)
        video = self.image_processor.preprocess(video, return_tensors="pt")[
            "pixel_values"
        ].to(device=self.model.device, dtype=self.dtype)
        video = [video]
        conv_template = (
            "qwen_1_5"  # Make sure you use correct chat template for different models
        )
        question = DEFAULT_IMAGE_TOKEN + f"\n{prompt}"
        conv = copy.deepcopy(conv_templates[conv_template])
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()
        input_ids = (
            tokenizer_image_token(
                prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
            )
            .unsqueeze(0)
            .to(self.model.device)
        )
        # NOTE(review): do_sample=False means greedy decoding, so
        # `temperature` has no effect here — confirm whether sampling was
        # intended.
        cont = self.model.generate(
            input_ids,
            images=video,
            modalities=["video"],
            do_sample=False,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )
        text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[
            0
        ].strip()
        return text_outputs

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        # BUGFIX: avoid a mutable default argument (was ["Yes", "No"]);
        # the old default is preserved via this None sentinel.
        if token_choices is None:
            token_choices = ["Yes", "No"]
        # TODO: not implemented for LLaVA-Video yet.
        pass
models/qwen2_5.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# This script requires transformers==4.57.0

import torch
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
)
from typing import Optional, Dict, Any, Union, List
from qwen_vl_utils import process_vision_info

# Handle both relative and absolute imports
try:
    from .base import BaseVideoModel
except ImportError:
    from base import BaseVideoModel


class Qwen2_5VLModel(BaseVideoModel):
    """Wrapper around Qwen2.5-VL for video question answering."""

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct",
        dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
        device_map: Optional[Union[str, Dict]] = "auto",
        attn_implementation: Optional[str] = "flash_attention_2",
    ):
        super().__init__(model_name)
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name,
            dtype=dtype,
            device_map=device_map,
            attn_implementation=attn_implementation,
        )
        self.processor = AutoProcessor.from_pretrained(model_name)

    def _prepare_inputs(self, prompt: str, video_path: str, fps: float):
        # Build processor inputs for one (video, text) user turn and move
        # them to the model device. Shared by chat() and
        # chat_with_confidence(), which previously duplicated this code.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video_path,
                        # "max_pixels": 360 * 420,
                        "fps": fps,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs, video_kwargs = process_vision_info(
            messages, return_video_kwargs=True
        )
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
            **video_kwargs,
        )
        return inputs.to(self.model.device)

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        temperature: float = 0.7,
        max_new_tokens: int = 512,
    ) -> str:
        """Answer *prompt* about the local video at *video_path*."""
        inputs = self._prepare_inputs(prompt, video_path, fps)

        # Inference
        generated_ids = self.model.generate(
            **inputs,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )
        # Strip the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_response = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return output_response

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """
        Returns the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.

        Args:
            prompt (str): The text prompt to generate a response for.
            video_path (str): The path to the video file.
            fps (float, optional): The frames per second of the video. Defaults to 1.0.
            max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 512.
            temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
            logits_temperature (float, optional): The logits temperature to use for generation. Defaults to 1.0.
            token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
            return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
            debug (bool, optional): Whether to run in debug mode. Defaults to False.

        Returns:
            Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.

            e.g., return_confidence: False
            Output:
                {
                    "response": "Yes",
                    "logits": {
                        "Yes": 12.0,
                        "No": 9.0
                    }
                }

            e.g., return_confidence: True
            Output:
                {
                    "response": "Yes",
                    "confidence": 0.9999
                }
        """
        # BUGFIX: avoid a mutable default argument (was ["Yes", "No"]);
        # the old default is preserved via this None sentinel.
        if token_choices is None:
            token_choices = ["Yes", "No"]

        inputs = self._prepare_inputs(prompt, video_path, fps)

        # Inference with scores
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                output_scores=True,
                return_dict_in_generate=True,
            )

        generated_ids = outputs.sequences
        scores = outputs.scores  # Tuple of tensors, one per generated token
        scores = tuple(
            s / logits_temperature for s in scores
        )  # Scales the logits by a factor for normalization during reporting

        if debug:
            # BUGFIX: these diagnostics printed unconditionally before;
            # they are now gated behind the debug flag.
            print("****Running inference in debug mode****")
            print(f"Number of generated tokens: {len(scores)}")
            print(f"Vocabulary size: {scores[0].shape[1]}")
            # Print first token scores shape and max/min scores in debug mode
            print(f"Single token scores shape: {scores[0].shape}")
            print(
                f"First token max/min scores: {scores[0].max().item()}, {scores[0].min().item()}"
            )
            # Print details about top 3 tokens at the first position
            top_3_tokens = torch.topk(scores[0], k=3, dim=-1)
            for i in range(3):
                print(
                    f"Pos 0 | {i+1}th Token: {self.processor.decode(top_3_tokens.indices[0, i].item())}"
                )
                print(
                    f"Pos 0 | {i+1}th Token logit: {top_3_tokens.values[0, i].item()}"
                )

        # Trim the prompt tokens from generated sequences
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the text
        output_response = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # Convert first-position scores to probabilities.
        # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
        selected_token_probs = []
        selected_token_logits = []
        first_token_probs = torch.softmax(scores[0], dim=-1)

        # Look up the probability/logit of each candidate answer token.
        for token_choice in token_choices:
            # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
            token_index = self.processor.tokenizer.encode(
                token_choice, add_special_tokens=False
            )[0]
            selected_token_probs.append(first_token_probs[0, token_index].item())
            selected_token_logits.append(scores[0][0, token_index].item())

        # Compute confidence as the ratio of the first generated token's
        # probability to the total probability mass of the candidate tokens.
        if return_confidence:
            first_token_id = generated_ids_trimmed[0][
                0
            ].item()  # First token of the first sequence
            confidence = (
                first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
                if sum(selected_token_probs) > 0
                else 0.0
            )
            return {
                "response": output_response,
                "confidence": confidence,
            }

        # Return token logits
        else:
            token_logits = dict(zip(token_choices, selected_token_logits))
            return {
                "response": output_response,
                "logits": token_logits,
            }
if __name__ == "__main__":
    # Smoke test: ask a binary-choice question about a local SSv2 clip and
    # inspect the logits (or confidence) of the answer tokens.
    model_path = "Qwen/Qwen2.5-VL-7B-Instruct"  # "Qwen/Qwen2.5-VL-7B-Instruct"
    model = Qwen2_5VLModel(model_path)

    prompt = (
        "Which of the following exist in the video? Answer in A or B.\nA: Hand\nB: Face"
    )
    token_choices = ["A", "B"]
    video_path = (
        "/home/shreyasj/Syed/data/Something-Something-V2/videos/101917" + ".webm"
    )

    generation_config = dict(
        max_new_tokens=128,
        temperature=0.7,
        logits_temperature=5.0,
        fps=3.0,
        return_confidence=False,
        debug=True,
    )
    output = model.chat_with_confidence(
        prompt, video_path, token_choices=token_choices, **generation_config
    )
    response = output["response"]
    print(f"Response: {response}")

    if generation_config["return_confidence"]:
        confidence = output["confidence"]
        print(f"Confidence: {confidence}")
    else:
        selected_token_logits = output["logits"]
        print(f"Selected token logits: {selected_token_logits}")
        print(f"Logits temperature: {generation_config['logits_temperature']}")
models/qwen3vl.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This script requires transformers>=4.57.0 (NOTE: requirements.txt pins transformers==5.1.0 — verify compatibility)
2
+
3
+ import torch
4
+ from transformers import (
5
+ Qwen3VLForConditionalGeneration,
6
+ AutoProcessor,
7
+ )
8
+ from typing import Optional, Dict, Any, Union, List
9
+ from qwen_vl_utils import process_vision_info
10
+
11
+ # Handle both relative and absolute imports
12
+ try:
13
+ from .base import BaseVideoModel
14
+ except ImportError:
15
+ from base import BaseVideoModel
16
+
17
+
18
class Qwen3VLModel(BaseVideoModel):
    """Qwen3-VL wrapper for video question answering.

    Provides plain free-form chat (``chat``) plus a scoring variant
    (``chat_with_confidence``) that inspects the logits of the *first*
    generated token for a fixed set of candidate answer tokens.
    """

    def __init__(
        self,
        model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
        dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
        device_map: Optional[Union[str, Dict]] = "auto",
        attn_implementation: Optional[str] = "flash_attention_2",
    ):
        """Load the Qwen3-VL checkpoint and its processor.

        Args:
            model_name: HF hub id or local path of a Qwen3-VL checkpoint.
            dtype: Torch dtype for the weights (bfloat16 by default).
            device_map: Device placement passed to ``from_pretrained``.
            attn_implementation: Attention backend (flash-attn 2 by default).
        """
        super().__init__(model_name)
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            model_name,
            dtype=dtype,
            device_map=device_map,
            attn_implementation=attn_implementation,
        )
        self.processor = AutoProcessor.from_pretrained(model_name)

    def chat(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        temperature: float = 0.7,
        max_new_tokens: int = 512,
    ) -> str:
        """Generate a free-form answer to ``prompt`` grounded in a local video.

        Args:
            prompt: Text query about the video.
            video_path: Path to a local video file.
            fps: Frame-sampling rate forwarded to the processor.
            temperature: Generation temperature.
            max_new_tokens: Cap on newly generated tokens.

        Returns:
            The decoded model response with the prompt tokens stripped.
        """
        # Single-turn user message: one video plus one text query.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video_path,
                        # "max_pixels": 360 * 420,
                        "fps": fps,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        # NOTE(review): ``do_sample`` is not set, so HF generation defaults to
        # greedy decoding and ``temperature`` is likely ignored — confirm intent.
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )

        # Strip the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        return self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

    def chat_with_confidence(
        self,
        prompt: str,
        video_path: str,
        fps: float = 1.0,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        token_choices: Optional[List[str]] = None,
        logits_temperature: Optional[float] = 1.0,
        return_confidence: Optional[bool] = False,
        debug: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """
        Returns the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.

        Args:
            prompt (str): The text prompt to generate a response for.
            video_path (str): The path to the video file.
            fps (float, optional): Frame-sampling rate forwarded to the processor. Defaults to 1.0.
            temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
            max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 512.
            token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"] when None.
            logits_temperature (float, optional): Divisor applied to reported logits/probabilities. Defaults to 1.0.
            return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
            debug (bool, optional): Whether to print diagnostic information. Defaults to False.

        Returns:
            Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.

            e.g., return_confidence: False
            Output:
            {
                "response": "Yes",
                "logits": {
                    "Yes": 12.0,
                    "No": 9.0
                }
            }

            e.g., return_confidence: True
            Output:
            {
                "response": "Yes",
                "confidence": 0.9999
            }
        """
        # Avoid a shared mutable default argument; None selects the
        # historical default choices.
        if token_choices is None:
            token_choices = ["Yes", "No"]

        # Messages containing a local video path and a text query
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video_path,
                        # "max_pixels": 360 * 420,
                        "fps": fps,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, videos, video_kwargs = process_vision_info(
            messages,
            image_patch_size=16,
            return_video_kwargs=True,
            return_video_metadata=True,
        )
        # process_vision_info returns (frames, metadata) pairs; split them so
        # the processor can receive each list separately.
        if videos is not None:
            videos, video_metadatas = zip(*videos)
            videos, video_metadatas = list(videos), list(video_metadatas)
        else:
            video_metadatas = None

        inputs = self.processor(
            text=text,
            images=image_inputs,
            videos=videos,
            video_metadata=video_metadatas,
            return_tensors="pt",
            do_resize=False,
            **video_kwargs,
        )
        inputs = inputs.to(self.model.device)

        # Inference with per-step scores so first-token logits can be inspected.
        # NOTE(review): ``do_sample`` is not set, so ``temperature`` is likely
        # ignored (greedy decoding) — confirm intent.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                output_scores=True,
                return_dict_in_generate=True,
            )

        generated_ids = outputs.sequences
        scores = outputs.scores  # Tuple of tensors, one per generated token
        scores = tuple(
            s / logits_temperature for s in scores
        )  # Scales the logits by a factor for normalization during reporting

        if debug:
            print("****Running inference in debug mode****")
            # Diagnostics moved under the debug flag so library callers are not
            # spammed on every invocation.
            print(f"Number of generated tokens: {len(scores)}")
            print(f"Vocabulary size: {scores[0].shape[1]}")
            # Print first token scores shape and max/min scores in debug mode
            print(f"Single token scores shape: {scores[0].shape}")
            print(
                f"First token max/min scores: {scores[0].max().item()}, {scores[0].min().item()}"
            )
            # Print details about top 3 tokens at the first generated position
            top_3_tokens = torch.topk(scores[0], k=3, dim=-1)
            for i in range(3):
                print(
                    f"Pos 0 | {i+1}th Token: {self.processor.decode(top_3_tokens.indices[0, i].item())}"
                )
                print(
                    f"Pos 0 | {i+1}th Token logit: {top_3_tokens.values[0, i].item()}"
                )

        # Trim the prompt tokens from generated sequences
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the text
        output_response = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # Convert scores to probabilities
        # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
        selected_token_probs = []
        selected_token_logits = []
        first_token_probs = torch.softmax(scores[0], dim=-1)

        # Now, find indices of tokens in token_choices and get their probabilities
        for token_choice in token_choices:
            # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
            token_index = self.processor.tokenizer.encode(
                token_choice, add_special_tokens=False
            )[0]
            selected_token_probs.append(first_token_probs[0, token_index].item())
            selected_token_logits.append(scores[0][0, token_index].item())

        # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
        if return_confidence:
            first_token_id = generated_ids_trimmed[0][
                0
            ].item()  # First token of the first sequence
            # NOTE(review): if the generated first token is not one of
            # token_choices, this ratio can exceed 1.0 — confirm intended.
            confidence = (
                first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
                if sum(selected_token_probs) > 0
                else 0.0
            )
            return {
                "response": output_response,
                "confidence": confidence,
            }

        # Return token logits
        else:
            token_logits = dict(zip(token_choices, selected_token_logits))
            return {
                "response": output_response,
                "logits": token_logits,
            }
262
+
263
+
264
if __name__ == "__main__":
    # Smoke test: free-form description, then choice-token logit scoring.
    model = Qwen3VLModel(
        "Qwen/Qwen3-VL-4B-Instruct"  # alternatives: "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
    )
    prompt = "Describe this video."
    video_path = (
        "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/1586.mp4"
    )
    print("Response: ", model.chat(prompt, video_path))

    # Second clip: score "A"/"B" answer tokens with debug diagnostics enabled.
    video_path = "/home/shreyasj/Syed/data/Something-Something-V2/videos/101917.webm"
    gen_cfg = {
        "max_new_tokens": 128,
        "temperature": 0.7,
        "logits_temperature": 5.0,
        "fps": 3.0,
        "return_confidence": False,
        "debug": True,
    }
    output = model.chat_with_confidence(
        prompt, video_path, token_choices=["A", "B"], **gen_cfg
    )
    print(f"Response: {output['response']}")

    if gen_cfg["return_confidence"]:
        print(f"Confidence: {output['confidence']}")
    else:
        print(f"Selected token logits: {output['logits']}")
        print(f"Logits temperature: {gen_cfg['logits_temperature']}")
requirements.txt ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.33.0
2
+ aiohappyeyeballs==2.4.0
3
+ aiohttp==3.10.5
4
+ aiosignal==1.3.1
5
+ anyio==4.12.1
6
+ asttokens==3.0.1
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ av==12.3.0
10
+ beautifulsoup4==4.14.3
11
+ bitsandbytes==0.41.0
12
+ black==25.12.0
13
+ cachetools==6.2.4
14
+ certifi==2024.8.30
15
+ cfgv==3.5.0
16
+ charset-normalizer==3.3.2
17
+ click==8.1.7
18
+ contourpy==1.3.2
19
+ cuda-bindings==12.9.4
20
+ cuda-pathfinder==1.3.3
21
+ cycler==0.12.1
22
+ datasets==2.16.1
23
+ decorator==5.2.1
24
+ decord==0.6.0
25
+ deepspeed==0.14.2
26
+ dill==0.3.7
27
+ distlib==0.4.0
28
+ distro==1.9.0
29
+ docker-pycreds==0.4.0
30
+ docstring_parser==0.16
31
+ einops==0.6.1
32
+ einops-exts==0.0.4
33
+ exceptiongroup==1.3.1
34
+ executing==2.2.1
35
+ filelock==3.20.3
36
+ flash-attn==2.5.7
37
+ fonttools==4.61.1
38
+ frozenlist==1.4.1
39
+ fsspec==2023.10.0
40
+ ftfy==6.2.3
41
+ gdown==5.2.1
42
+ gitdb==4.0.11
43
+ GitPython==3.1.43
44
+ gradio==6.2.0
45
+ gradio_client==2.0.2
46
+ h11==0.16.0
47
+ hf-xet==1.2.0
48
+ hf_transfer==0.1.8
49
+ hjson==3.1.0
50
+ httpcore==1.0.9
51
+ httpx==0.28.1
52
+ huggingface_hub==1.4.1
53
+ identify==2.6.16
54
+ ipython==8.38.0
55
+ jedi==0.19.2
56
+ Jinja2==3.1.4
57
+ jiter==0.6.1
58
+ joblib==1.5.3
59
+ kiwisolver==1.4.9
60
+ latex2mathml==3.77.0
61
+ llava @ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git@e9835311c6f515a13702eb7a7750fcd936f65ed8
62
+ markdown-it-py==3.0.0
63
+ markdown2==2.5.0
64
+ MarkupSafe==2.1.5
65
+ matplotlib==3.10.8
66
+ matplotlib-inline==0.2.1
67
+ mpmath==1.3.0
68
+ multidict==6.0.5
69
+ multiprocess==0.70.15
70
+ mypy_extensions==1.1.0
71
+ networkx==3.4.2
72
+ ninja==1.11.1.1
73
+ nltk==3.9.2
74
+ nodeenv==1.10.0
75
+ numpy==1.26.4
76
+ open_clip_torch==2.26.1
77
+ openai==1.52.2
78
+ opencv-python==4.10.0.84
79
+ packaging==26.0
80
+ pandas==2.3.3
81
+ parso==0.8.5
82
+ pathspec==1.0.3
83
+ peft==0.4.0
84
+ pexpect==4.9.0
85
+ pillow==12.1.0
86
+ platformdirs==4.2.2
87
+ pre_commit==4.5.1
88
+ prompt_toolkit==3.0.52
89
+ protobuf==5.28.0
90
+ psutil==7.2.1
91
+ ptyprocess==0.7.0
92
+ pure_eval==0.2.3
93
+ py-cpuinfo==9.0.0
94
+ pyarrow==17.0.0
95
+ pyarrow-hotfix==0.6
96
+ pydantic_core==2.41.5
97
+ Pygments==2.18.0
98
+ pynvml==13.0.1
99
+ pyparsing==3.3.2
100
+ PySocks==1.7.1
101
+ python-dateutil==2.9.0.post0
102
+ pytokens==0.3.0
103
+ pytz==2024.1
104
+ PyYAML
105
+ regex==2026.1.15
106
+ requests==2.32.3
107
+ rich==13.8.0
108
+ safetensors==0.7.0
109
+ scikit-learn==1.7.2
110
+ scipy==1.15.3
111
+ seaborn==0.13.2
112
+ sentence-transformers==5.2.2
113
+ sentencepiece==0.1.99
114
+ sentry-sdk==2.13.0
115
+ setproctitle==1.3.3
116
+ shellingham==1.5.4
117
+ shortuuid==1.0.13
118
+ shtab==1.7.1
119
+ six==1.16.0
120
+ smmap==5.0.1
121
+ soupsieve==2.8.3
122
+ stack-data==0.6.3
123
+ svgwrite==1.4.3
124
+ sympy==1.14.0
125
+ termcolor==3.3.0
126
+ threadpoolctl==3.6.0
127
+ timm==1.0.9
128
+ tokenizers==0.22.2
129
+ tomli==2.4.0
130
+ torch==2.2.1
131
+ torchvision==0.17.1
132
+ tqdm==4.67.3
133
+ traitlets==5.14.3
134
+ transformers==5.1.0
135
+ triton==2.2.0
136
+ typer==0.20.0
137
+ typer-slim==0.21.1
138
+ typing_extensions==4.15.0
139
+ tyro==0.8.10
140
+ tzdata==2025.3
141
+ urllib3==1.26.20
142
+ uvicorn==0.30.6
143
+ virtualenv==20.36.1
144
+ wandb==0.17.8
145
+ wavedrom==2.0.3.post3
146
+ wcwidth==0.2.13
147
+ websockets==13.0.1
148
+ xxhash==3.5.0
149
+ yarl==1.9.7