jena-shreyas commited on
Commit
ef7643d
Β·
1 Parent(s): cf5f08b

Add transformers v5 integration to models

Browse files
Files changed (6) hide show
  1. app.py +2 -2
  2. models/__init__.py +25 -92
  3. models/base.py +8 -8
  4. models/llava_video.py +222 -243
  5. models/qwen2_5vl.py +232 -222
  6. models/qwen3vl.py +431 -477
app.py CHANGED
@@ -12,7 +12,7 @@ from models.base import BaseVideoModel
12
  # ----------------------
13
  # CONFIG
14
  # ----------------------
15
- MODEL_PATH = "lmms-lab/LLaVA-Video-7B-Qwen2"
16
  DEVICE_MAP = "cuda:0"
17
 
18
  VIDEO_DIR = str(Path(__file__).parent / "videos")
@@ -130,7 +130,7 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
130
 
131
  fps_slider = gr.Slider(
132
  minimum=0.5,
133
- maximum=5.0,
134
  step=0.5,
135
  value=FPS,
136
  label="🎞️ Frames Per Second (FPS)",
 
12
  # ----------------------
13
  # CONFIG
14
  # ----------------------
15
+ MODEL_PATH = "Isotr0py/LLaVA-Video-7B-Qwen2-hf"
16
  DEVICE_MAP = "cuda:0"
17
 
18
  VIDEO_DIR = str(Path(__file__).parent / "videos")
 
130
 
131
  fps_slider = gr.Slider(
132
  minimum=0.5,
133
+ maximum=10.0,
134
  step=0.5,
135
  value=FPS,
136
  label="🎞️ Frames Per Second (FPS)",
models/__init__.py CHANGED
@@ -3,66 +3,27 @@ from packaging import version
3
  import torch
4
  from typing import Optional, Union, Dict
5
 
 
6
  # IMP: Add required versions here
7
- qwen_required_version = version.parse("4.57.0")
8
- internvl_required_version = version.parse("4.45.0")
9
- llava_required_version = version.parse("4.40.0")
10
 
11
  # Conditional imports based on transformers version
12
- try:
13
- import transformers
14
- from transformers.generation.logits_process import LogitsProcessor
15
-
16
- # Check transformers version
17
- transformers_version = version.parse(transformers.__version__)
18
-
19
- QWEN_MODELS_AVAILABLE = False
20
- INTERNVL_MODELS_AVAILABLE = False
21
- LLAVA_MODELS_AVAILABLE = False
22
-
23
- # Qwen condition
24
- if transformers_version >= qwen_required_version:
25
- from .qwen2_5vl import Qwen2_5VLModel
26
- from .qwen3vl import Qwen3VLModel
27
-
28
- QWEN_MODELS_AVAILABLE = True
29
- else:
30
- print(
31
- f"Warning: Qwen models require transformers>=4.57.0, but found {transformers.__version__}. Qwen models will not be available. Please upgrade to transformers>=4.57.0 or switch conda environments to use Qwen models."
32
- )
33
-
34
- # InternVL condition
35
- if transformers_version >= internvl_required_version:
36
- from .internvl import InternVLModel
37
-
38
- INTERNVL_MODELS_AVAILABLE = True
39
- else:
40
- print(
41
- f"Warning: InternVL models require transformers>=4.45.0, but found {transformers.__version__}. InternVL models will not be available. Please downgrade to transformers<=4.45.0 or switch conda environments to use InternVL models."
42
- )
43
 
44
- # LLaVA condition
45
- if transformers_version <= llava_required_version:
46
- from .llava_video import LLaVAVideoModel
47
 
48
- LLAVA_MODELS_AVAILABLE = True
49
- else:
50
- print(
51
- f"Warning: LLaVA models require transformers<=4.40.0, but found {transformers.__version__}. LLaVA models will not be available. Please downgrade to transformers<=4.40.0 or switch conda environments to use LLaVA models."
52
- )
53
- except ImportError:
54
- print(
55
- "Warning: Could not check transformers version. Please re-check transformers installation."
56
- )
57
 
58
- # Build __all__ list dynamically
59
- __all__ = []
60
- if QWEN_MODELS_AVAILABLE:
61
- __all__.extend(["Qwen2_5VLModel", "Qwen3VLModel"])
62
- if INTERNVL_MODELS_AVAILABLE:
63
- __all__.append("InternVLModel")
64
- if LLAVA_MODELS_AVAILABLE:
65
- __all__.append("LLaVAVideoModel")
66
 
67
 
68
  # Function to get the model by mapping model ID to the correct model class
@@ -71,31 +32,27 @@ def load_model(
71
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
72
  device_map: Optional[Union[str, Dict]] = "auto",
73
  attn_implementation: Optional[str] = "flash_attention_2",
 
 
74
  ) -> BaseVideoModel:
75
  if "LLaVA-Video" in model_path:
76
- if not LLAVA_MODELS_AVAILABLE:
77
- raise ImportError(
78
- f"LLaVA models require transformers<=4.40.0."
79
- f"Please downgrade transformers: pip install transformers<=4.40.0"
80
- )
81
  return LLaVAVideoModel(
82
  model_path,
83
  dtype=dtype,
84
  device_map=device_map,
85
  attn_implementation=attn_implementation,
 
 
86
  )
87
  elif "Qwen" in model_path:
88
- if not QWEN_MODELS_AVAILABLE:
89
- raise ImportError(
90
- f"Qwen models require transformers>=4.57.0."
91
- f"Please upgrade transformers: pip install transformers>=4.57.0"
92
- )
93
  if "Qwen3" in model_path:
94
  return Qwen3VLModel(
95
  model_path,
96
  dtype=dtype,
97
  device_map=device_map,
98
  attn_implementation=attn_implementation,
 
 
99
  )
100
  else:
101
  return Qwen2_5VLModel(
@@ -103,39 +60,15 @@ def load_model(
103
  dtype=dtype,
104
  device_map=device_map,
105
  attn_implementation=attn_implementation,
 
 
106
  )
107
  elif "Intern" in model_path:
108
- if not INTERNVL_MODELS_AVAILABLE:
109
- raise ImportError(
110
- f"InternVL models require transformers>=4.45.0."
111
- f"Please upgrade transformers: pip install transformers>=4.45.0"
112
- )
113
  return InternVLModel(
114
  model_path,
115
  dtype=dtype,
116
  device_map=device_map,
117
  attn_implementation=attn_implementation,
 
 
118
  )
119
-
120
-
121
- class LogitsCaptureProcessor(LogitsProcessor):
122
- """
123
- Custom LogitsProcessor that captures the processed logits right before sampling.
124
- This allows us to see what the actual distribution looks like after all other
125
- processors have been applied.
126
- """
127
-
128
- def __init__(self):
129
- self.captured_logits = []
130
-
131
- def __call__(
132
- self, input_ids: torch.LongTensor, scores: torch.FloatTensor
133
- ) -> torch.FloatTensor:
134
- # Store a copy of the logits at this point in generation
135
- self.captured_logits.append(scores.detach().clone().cpu())
136
- # Return scores unchanged - we're just observing
137
- return scores
138
-
139
- def reset(self):
140
- """Clear captured logits for a new generation"""
141
- self.captured_logits = []
 
3
  import torch
4
  from typing import Optional, Union, Dict
5
 
6
+
7
  # IMP: Add required versions here
8
+ transformers_required_version = version.parse("5.0.0")
 
 
9
 
10
  # Conditional imports based on transformers version
11
+ import transformers
12
+ from transformers import BitsAndBytesConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # Check transformers version
15
+ transformers_version = version.parse(transformers.__version__)
 
16
 
17
+ # transformers v5 condition
18
+ if transformers_version >= transformers_required_version:
19
+ from .qwen2_5vl import Qwen2_5VLModel
20
+ from .qwen3vl import Qwen3VLModel
21
+ from .internvl import InternVLModel
22
+ from .llava_video import LLaVAVideoModel
 
 
 
23
 
24
+ TRANSFORMERS_MODELS_AVAILABLE = True
25
+ else:
26
+ raise ValueError(f"Transformers v5 models require transformers>=5.0.0, but found {transformers.__version__}. Transformers v5 models will not be available. Please upgrade to transformers>=5.0.0 or switch conda environments to use Transformers v5 models.")
 
 
 
 
 
27
 
28
 
29
  # Function to get the model by mapping model ID to the correct model class
 
32
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
33
  device_map: Optional[Union[str, Dict]] = "auto",
34
  attn_implementation: Optional[str] = "flash_attention_2",
35
+ load_8bit: Optional[bool] = False,
36
+ load_4bit: Optional[bool] = False,
37
  ) -> BaseVideoModel:
38
  if "LLaVA-Video" in model_path:
 
 
 
 
 
39
  return LLaVAVideoModel(
40
  model_path,
41
  dtype=dtype,
42
  device_map=device_map,
43
  attn_implementation=attn_implementation,
44
+ load_8bit=load_8bit,
45
+ load_4bit=load_4bit,
46
  )
47
  elif "Qwen" in model_path:
 
 
 
 
 
48
  if "Qwen3" in model_path:
49
  return Qwen3VLModel(
50
  model_path,
51
  dtype=dtype,
52
  device_map=device_map,
53
  attn_implementation=attn_implementation,
54
+ load_8bit=load_8bit,
55
+ load_4bit=load_4bit,
56
  )
57
  else:
58
  return Qwen2_5VLModel(
 
60
  dtype=dtype,
61
  device_map=device_map,
62
  attn_implementation=attn_implementation,
63
+ load_8bit=load_8bit,
64
+ load_4bit=load_4bit,
65
  )
66
  elif "Intern" in model_path:
 
 
 
 
 
67
  return InternVLModel(
68
  model_path,
69
  dtype=dtype,
70
  device_map=device_map,
71
  attn_implementation=attn_implementation,
72
+ load_8bit=load_8bit,
73
+ load_4bit=load_4bit,
74
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/base.py CHANGED
@@ -17,11 +17,11 @@ class BaseVideoModel(ABC):
17
  ) -> str:
18
  pass
19
 
20
- @abstractmethod
21
- def chat_with_confidence(
22
- self,
23
- prompt: str,
24
- video_path: str,
25
- generation_config: Optional[Dict[str, Any]] = None,
26
- ) -> Dict[str, Union[str, float]]:
27
- pass
 
17
  ) -> str:
18
  pass
19
 
20
+ # @abstractmethod
21
+ # def chat_with_confidence(
22
+ # self,
23
+ # prompt: str,
24
+ # video_path: str,
25
+ # generation_config: Optional[Dict[str, Any]] = None,
26
+ # ) -> Dict[str, Union[str, float]]:
27
+ # pass
models/llava_video.py CHANGED
@@ -1,26 +1,11 @@
1
  # Run with `conda activate llava`
2
- from llava.model.builder import load_pretrained_model
3
- from llava.mm_utils import (
4
- get_model_name_from_path,
5
- process_images,
6
- tokenizer_image_token,
7
- )
8
- from llava.constants import (
9
- IMAGE_TOKEN_INDEX,
10
- DEFAULT_IMAGE_TOKEN,
11
- DEFAULT_IM_START_TOKEN,
12
- DEFAULT_IM_END_TOKEN,
13
- IGNORE_INDEX,
14
- )
15
- from llava.conversation import conv_templates, SeparatorStyle
16
- from PIL import Image
17
- import requests
18
  import copy
19
  import torch
20
- from typing import Optional, Union, Dict, List, Tuple, Any
21
- import warnings
22
- from decord import VideoReader, cpu
23
  import numpy as np
 
 
 
24
 
25
  # Handle both relative and absolute imports
26
  try:
@@ -30,46 +15,37 @@ except ImportError:
30
 
31
  warnings.filterwarnings("ignore")
32
 
33
-
34
  class LLaVAVideoModel(BaseVideoModel):
35
  def __init__(
36
  self,
37
- model_name: str = "lmms-lab/LLaVA-Video-7B-Qwen2",
38
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
39
  device_map: Optional[Union[str, Dict]] = "auto",
40
  attn_implementation: Optional[str] = "flash_attention_2",
 
 
41
  ):
42
  super().__init__(model_name)
43
- base_model = "llava_qwen"
44
  self.dtype = dtype
45
- # Convert torch dtype to string for safety, since LLaVA-Video only accepts torch_dtype as a string
46
- if dtype == torch.bfloat16:
47
- torch_dtype = "bfloat16"
48
- elif dtype == torch.float16:
49
- torch_dtype = "float16"
50
-
51
- self.tokenizer, self.model, self.image_processor, max_length = (
52
- load_pretrained_model(
53
- model_name,
54
- None,
55
- base_model,
56
- torch_dtype=torch_dtype,
57
- device_map=device_map,
58
- )
59
- ) # Add any other thing you want to pass in llava_model_args
60
- self.model.eval()
61
 
62
- # Ensure all model components are on the same device
63
- # The vision tower and mm_projector may not be on the correct device with device_map using `load_pretrained_model`, so need to explicitly move to the model's device
64
- if hasattr(self.model, "get_vision_tower"):
65
- vision_tower = self.model.get_vision_tower()
66
- if vision_tower is not None:
67
- vision_tower.to(self.model.device)
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- if hasattr(self.model, "get_model"):
70
- model_inner = self.model.get_model()
71
- if hasattr(model_inner, "mm_projector"):
72
- model_inner.mm_projector.to(self.model.device)
73
 
74
  def load_video(
75
  self,
@@ -101,224 +77,227 @@ class LLaVAVideoModel(BaseVideoModel):
101
  self,
102
  prompt: str,
103
  video_path: str,
104
- fps: float = 1.0,
105
  max_new_tokens: int = 512,
106
  do_sample: Optional[
107
  bool
108
  ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
109
  temperature: float = 0.7,
110
  video_mode: Optional[str] = "video",
111
- video_frames: Optional[int] = 10,
 
112
  **kwargs: Any,
113
  ) -> str:
 
114
  if video_mode == "frames":
115
- video, _, _ = self.load_video(video_path, max_frames_num=video_frames)
116
  elif video_mode == "video":
117
- video, _, _ = self.load_video(video_path, fps)
118
- video = self.image_processor.preprocess(video, return_tensors="pt")[
119
- "pixel_values"
120
- ].to(device=self.model.device, dtype=self.dtype)
121
- video = [video]
122
- conv_template = (
123
- "qwen_1_5" # Make sure you use correct chat template for different models
124
- )
125
- question = DEFAULT_IMAGE_TOKEN + f"\n{prompt}"
126
- conv = copy.deepcopy(conv_templates[conv_template])
127
- conv.append_message(conv.roles[0], question)
128
- conv.append_message(conv.roles[1], None)
129
- prompt_question = conv.get_prompt()
130
- input_ids = (
131
- tokenizer_image_token(
132
- prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
133
- )
134
- .unsqueeze(0)
135
- .to(self.model.device)
136
- )
137
- cont = self.model.generate(
138
- input_ids,
139
- images=video,
140
- modalities=["video"],
141
- do_sample=do_sample,
142
- temperature=temperature,
143
- max_new_tokens=max_new_tokens,
144
- **kwargs,
145
- )
146
- text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[
147
- 0
148
- ].strip()
149
- return text_outputs
150
 
151
- def chat_with_confidence(
152
- self,
153
- prompt: str,
154
- video_path: str,
155
- fps: float = 1.0,
156
- max_new_tokens: int = 512,
157
- temperature: float = 0.7,
158
- do_sample: Optional[
159
- bool
160
- ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
161
- token_choices: Optional[List[str]] = ["Yes", "No"],
162
- logits_temperature: Optional[float] = 1.0,
163
- return_confidence: Optional[bool] = False,
164
- top_k_tokens: Optional[int] = 10,
165
- debug: Optional[bool] = False,
166
- ) -> Dict[str, Any]:
167
- video, _, _ = self.load_video(video_path, fps)
168
- video = self.image_processor.preprocess(video, return_tensors="pt")[
169
- "pixel_values"
170
- ].to(device=self.model.device, dtype=self.dtype)
171
- video = [video]
172
- conv_template = (
173
- "qwen_1_5" # Make sure you use correct chat template for different models
174
- )
175
- question = DEFAULT_IMAGE_TOKEN + f"\n{prompt}"
176
- conv = copy.deepcopy(conv_templates[conv_template])
177
- conv.append_message(conv.roles[0], question)
178
- conv.append_message(conv.roles[1], None)
179
- prompt_question = conv.get_prompt()
180
- input_ids = (
181
- tokenizer_image_token(
182
- prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
183
- )
184
- .unsqueeze(0)
185
- .to(self.model.device)
186
- )
187
  with torch.no_grad():
188
- outputs = self.model.generate(
189
- input_ids,
190
- images=video,
191
- modalities=["video"],
192
- do_sample=do_sample, # Was set to False, i.e., greedy sampling, which invalidates things like temperature, top-K, top-P!
193
  temperature=temperature,
194
  max_new_tokens=max_new_tokens,
195
- output_scores=True,
196
- return_dict_in_generate=True,
197
  )
198
- generated_ids = outputs.sequences
199
- scores = outputs.scores # Tuple of tensors, one per generated token
 
200
 
201
- print(f"Number of generated tokens: {len(scores)}")
202
- print(f"Vocabulary size: {scores[0].shape[1]}")
203
- # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
204
- if debug:
205
- print("****Running inference in debug mode****")
206
- # Print first token scores shape and max/min scores in debug mode
207
- print(f"Single token scores shape: {scores[0].shape}")
208
- print(
209
- f"Max score: {scores[0].max().item():.4f} | Min score: {scores[0].min().item():.4f}"
210
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- # Print details about top 10 tokens based on logits
213
- logits_type = "POST-PROCESSED" if do_sample is True else "RAW"
214
- print(f"\n{'─'*80}")
215
- print(
216
- f"TOP {top_k_tokens} TOKENS FROM {logits_type} LOGITS (outputs.scores):"
217
- )
218
- print(f"{'─'*80}")
219
- top_k_tokens_scores = torch.topk(scores[0], k=top_k_tokens, dim=-1)
220
- for i in range(top_k_tokens):
221
- score = top_k_tokens_scores.values[0, i].item()
222
- score_index = top_k_tokens_scores.indices[0, i].item()
223
- token = self.tokenizer.decode(score_index)
224
- print(f"#{i+1}th Token: {token}")
225
- print(f"#{i+1}th Token index: {score_index}")
226
- print(f"#{i+1}th Token score: {score}")
227
- print("--------------------------------")
228
 
229
- # Decode the text
230
- output_response = self.tokenizer.batch_decode(
231
- generated_ids,
232
- skip_special_tokens=True,
233
- clean_up_tokenization_spaces=False,
234
- )[0]
 
 
 
 
 
 
 
 
 
 
235
 
236
- # Convert scores to probabilities
237
- # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
238
- selected_token_probs = []
239
- selected_token_logits = []
240
- first_token_probs = torch.softmax(scores[0], dim=-1)
 
241
 
242
- # Now, find indices of tokens in token_choices and get their probabilities
243
- for token_choice in token_choices:
244
- # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
245
- token_index = self.tokenizer.encode(token_choice, add_special_tokens=False)[
246
- 0
247
- ]
248
- selected_token_probs.append(first_token_probs[0, token_index].item())
249
- selected_token_logits.append(scores[0][0, token_index].item())
250
 
251
- # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
252
- if return_confidence:
253
- first_token_id = generated_ids[0][
254
- 0
255
- ].item() # First token of the first sequence
256
- confidence = (
257
- first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
258
- if sum(selected_token_probs) > 0
259
- else 0.0
260
- )
261
- return {
262
- "response": output_response,
263
- "confidence": confidence,
264
- }
 
 
 
 
 
 
 
 
 
265
 
266
- # Return token logits
267
- else:
268
- token_logits = dict(zip(token_choices, selected_token_logits))
269
- top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
270
- top_k_tokens_list: List[Tuple[str, int, float]] = []
271
- for i in range(top_k_tokens):
272
- logit_index = top_k_logits_indices.indices[0, i].item()
273
- token = self.tokenizer.decode(logit_index)
274
- logit = top_k_logits_indices.values[0, i].item()
275
- top_k_tokens_list.append((token, logit_index, logit))
276
- return {
277
- "response": output_response,
278
- "top_k_tokens": top_k_tokens_list,
279
- "token_logits": token_logits,
280
- }
281
 
282
 
283
- if __name__ == "__main__":
284
- model_path = "lmms-lab/LLaVA-Video-7B-Qwen2" # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
285
- device_map = "cuda:0"
286
- model = LLaVAVideoModel(model_path, device_map=device_map)
287
- prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying but failing to attach clip to ring because it doesn\'t stick\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Hand is holding the clip\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Clip is not attached to the ring\n\nAnswer:'
288
- token_choices = ["Yes", "No"]
289
- video_path = (
290
- "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/101917.mp4"
291
- )
292
 
293
- generation_config = {
294
- "max_new_tokens": 128,
295
- "do_sample": False, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
296
- "temperature": 0.7,
297
- "logits_temperature": 1.0,
298
- "fps": 1.0,
299
- "return_confidence": False,
300
- "top_k_tokens": 10,
301
- "debug": False,
302
- }
303
- output = model.chat_with_confidence(
304
- prompt, video_path, token_choices=token_choices, **generation_config
305
- )
306
- response = output["response"]
307
- print(f"Response: {response}")
308
 
309
- if generation_config["return_confidence"]:
310
- confidence = output["confidence"]
311
- print(f"Confidence: {confidence}")
312
- else:
313
- # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
314
- # otherwise, the raw logits are used, which are not filtered.
315
- logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
316
- print(f"\n{'─'*80}")
317
- print(f"TOP 10 TOKENS FROM {logits_type} LOGITS (outputs.scores):")
318
- print(f"{'─'*80}")
319
- top_k_tokens = output["top_k_tokens"]
320
- for i in range(len(top_k_tokens)):
321
- print(f"Top {i+1} token: {top_k_tokens[i][0]}")
322
- print(f"Top {i+1} token index: {top_k_tokens[i][1]}")
323
- print(f"Top {i+1} token logit: {top_k_tokens[i][2]}")
324
- print("--------------------------------")
 
1
  # Run with `conda activate llava`
2
+ import warnings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import copy
4
  import torch
 
 
 
5
  import numpy as np
6
+ from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
7
+ from typing import Optional, Dict, Any, Union, List
8
+ from decord import VideoReader, cpu
9
 
10
  # Handle both relative and absolute imports
11
  try:
 
15
 
16
  warnings.filterwarnings("ignore")
17
 
 
18
  class LLaVAVideoModel(BaseVideoModel):
19
  def __init__(
20
  self,
21
+ model_name: str = "Isotr0py/LLaVA-Video-7B-Qwen2-hf",
22
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
23
  device_map: Optional[Union[str, Dict]] = "auto",
24
  attn_implementation: Optional[str] = "flash_attention_2",
25
+ load_8bit: Optional[bool] = False,
26
+ load_4bit: Optional[bool] = False,
27
  ):
28
  super().__init__(model_name)
 
29
  self.dtype = dtype
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # For quantized models (8-bit or 4-bit), device_map must be "auto" or a dict, not a device string
32
+ quantization_config = None
33
+ if load_8bit or load_4bit:
34
+ quantization_config = BitsAndBytesConfig(
35
+ load_in_8bit=load_8bit,
36
+ load_in_4bit=load_4bit,
37
+ bnb_4bit_quant_type="nf4",
38
+ bnb_4bit_compute_dtype=torch.float16
39
+ )
40
+ self.model = AutoModelForImageTextToText.from_pretrained(
41
+ model_name,
42
+ quantization_config=quantization_config,
43
+ device_map=device_map,
44
+ attn_implementation=attn_implementation,
45
+ dtype=dtype,
46
+ )
47
+ self.processor = AutoProcessor.from_pretrained(model_name)
48
 
 
 
 
 
49
 
50
  def load_video(
51
  self,
 
77
  self,
78
  prompt: str,
79
  video_path: str,
 
80
  max_new_tokens: int = 512,
81
  do_sample: Optional[
82
  bool
83
  ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
84
  temperature: float = 0.7,
85
  video_mode: Optional[str] = "video",
86
+ fps: Optional[float] = 1.0,
87
+ num_frames: Optional[int] = 10,
88
  **kwargs: Any,
89
  ) -> str:
90
+ # Ensure only one of fps or num_frames is provided
91
  if video_mode == "frames":
92
+ fps = None
93
  elif video_mode == "video":
94
+ num_frames = None
95
+ conversation = [
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {
100
+ "type": "video",
101
+ "video": video_path,
102
+ },
103
+ {"type": "text", "text": prompt}
104
+ ],
105
+ },
106
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
+ inputs = self.processor.apply_chat_template(
109
+ conversation,
110
+ add_generation_prompt=True,
111
+ tokenize=True,
112
+ return_dict=True,
113
+ return_tensors="pt",
114
+ do_sample_frames=True,
115
+ fps=fps,
116
+ num_frames=num_frames
117
+ ).to(self.model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  with torch.no_grad():
119
+ out = self.model.generate(
120
+ **inputs,
121
+ do_sample=do_sample,
 
 
122
  temperature=temperature,
123
  max_new_tokens=max_new_tokens,
124
+ **kwargs,
 
125
  )
126
+ raw_response = self.processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
127
+ response = raw_response.split("assistant")[1].strip()
128
+ return response
129
 
130
+ # def chat_with_confidence(
131
+ # self,
132
+ # prompt: str,
133
+ # video_path: str,
134
+ # fps: float = 1.0,
135
+ # max_new_tokens: int = 512,
136
+ # temperature: float = 0.7,
137
+ # do_sample: Optional[
138
+ # bool
139
+ # ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
140
+ # token_choices: Optional[List[str]] = ["Yes", "No"],
141
+ # logits_temperature: Optional[float] = 1.0,
142
+ # return_confidence: Optional[bool] = False,
143
+ # top_k_tokens: Optional[int] = 10,
144
+ # debug: Optional[bool] = False,
145
+ # ) -> Dict[str, Any]:
146
+ # video, _, _ = self.load_video(video_path, fps)
147
+ # video = self.image_processor.preprocess(video, return_tensors="pt")[
148
+ # "pixel_values"
149
+ # ].to(device=self.model.device, dtype=self.dtype)
150
+ # video = [video]
151
+ # conv_template = (
152
+ # "qwen_1_5" # Make sure you use correct chat template for different models
153
+ # )
154
+ # question = DEFAULT_IMAGE_TOKEN + f"\n{prompt}"
155
+ # conv = copy.deepcopy(conv_templates[conv_template])
156
+ # conv.append_message(conv.roles[0], question)
157
+ # conv.append_message(conv.roles[1], None)
158
+ # prompt_question = conv.get_prompt()
159
+ # input_ids = (
160
+ # tokenizer_image_token(
161
+ # prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
162
+ # )
163
+ # .unsqueeze(0)
164
+ # .to(self.model.device)
165
+ # )
166
+ # with torch.no_grad():
167
+ # outputs = self.model.generate(
168
+ # input_ids,
169
+ # images=video,
170
+ # modalities=["video"],
171
+ # do_sample=do_sample, # Was set to False, i.e., greedy sampling, which invalidates things like temperature, top-K, top-P!
172
+ # temperature=temperature,
173
+ # max_new_tokens=max_new_tokens,
174
+ # output_scores=True,
175
+ # return_dict_in_generate=True,
176
+ # )
177
+ # generated_ids = outputs.sequences
178
+ # scores = outputs.scores # Tuple of tensors, one per generated token
179
 
180
+ # print(f"Number of generated tokens: {len(scores)}")
181
+ # print(f"Vocabulary size: {scores[0].shape[1]}")
182
+ # # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
183
+ # if debug:
184
+ # print("****Running inference in debug mode****")
185
+ # # Print first token scores shape and max/min scores in debug mode
186
+ # print(f"Single token scores shape: {scores[0].shape}")
187
+ # print(
188
+ # f"Max score: {scores[0].max().item():.4f} | Min score: {scores[0].min().item():.4f}"
189
+ # )
 
 
 
 
 
 
190
 
191
+ # # Print details about top 10 tokens based on logits
192
+ # logits_type = "POST-PROCESSED" if do_sample is True else "RAW"
193
+ # print(f"\n{'─'*80}")
194
+ # print(
195
+ # f"TOP {top_k_tokens} TOKENS FROM {logits_type} LOGITS (outputs.scores):"
196
+ # )
197
+ # print(f"{'─'*80}")
198
+ # top_k_tokens_scores = torch.topk(scores[0], k=top_k_tokens, dim=-1)
199
+ # for i in range(top_k_tokens):
200
+ # score = top_k_tokens_scores.values[0, i].item()
201
+ # score_index = top_k_tokens_scores.indices[0, i].item()
202
+ # token = self.tokenizer.decode(score_index)
203
+ # print(f"#{i+1}th Token: {token}")
204
+ # print(f"#{i+1}th Token index: {score_index}")
205
+ # print(f"#{i+1}th Token score: {score}")
206
+ # print("--------------------------------")
207
 
208
+ # # Decode the text
209
+ # output_response = self.tokenizer.batch_decode(
210
+ # generated_ids,
211
+ # skip_special_tokens=True,
212
+ # clean_up_tokenization_spaces=False,
213
+ # )[0]
214
 
215
+ # # Convert scores to probabilities
216
+ # # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
217
+ # selected_token_probs = []
218
+ # selected_token_logits = []
219
+ # first_token_probs = torch.softmax(scores[0], dim=-1)
 
 
 
220
 
221
+ # # Now, find indices of tokens in token_choices and get their probabilities
222
+ # for token_choice in token_choices:
223
+ # # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
224
+ # token_index = self.tokenizer.encode(token_choice, add_special_tokens=False)[
225
+ # 0
226
+ # ]
227
+ # selected_token_probs.append(first_token_probs[0, token_index].item())
228
+ # selected_token_logits.append(scores[0][0, token_index].item())
229
+
230
+ # # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
231
+ # if return_confidence:
232
+ # first_token_id = generated_ids[0][
233
+ # 0
234
+ # ].item() # First token of the first sequence
235
+ # confidence = (
236
+ # first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
237
+ # if sum(selected_token_probs) > 0
238
+ # else 0.0
239
+ # )
240
+ # return {
241
+ # "response": output_response,
242
+ # "confidence": confidence,
243
+ # }
244
 
245
+ # # Return token logits
246
+ # else:
247
+ # token_logits = dict(zip(token_choices, selected_token_logits))
248
+ # top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
249
+ # top_k_tokens_list: List[Tuple[str, int, float]] = []
250
+ # for i in range(top_k_tokens):
251
+ # logit_index = top_k_logits_indices.indices[0, i].item()
252
+ # token = self.tokenizer.decode(logit_index)
253
+ # logit = top_k_logits_indices.values[0, i].item()
254
+ # top_k_tokens_list.append((token, logit_index, logit))
255
+ # return {
256
+ # "response": output_response,
257
+ # "top_k_tokens": top_k_tokens_list,
258
+ # "token_logits": token_logits,
259
+ # }
260
 
261
 
262
+ # if __name__ == "__main__":
263
+ # model_path = "lmms-lab/LLaVA-Video-7B-Qwen2" # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
264
+ # device_map = "cuda:0"
265
+ # model = LLaVAVideoModel(model_path, device_map=device_map)
266
+ # prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying but failing to attach clip to ring because it doesn\'t stick\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Hand is holding the clip\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Clip is not attached to the ring\n\nAnswer:'
267
+ # token_choices = ["Yes", "No"]
268
+ # video_path = (
269
+ # "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/101917.mp4"
270
+ # )
271
 
272
+ # generation_config = {
273
+ # "max_new_tokens": 128,
274
+ # "do_sample": False, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
275
+ # "temperature": 0.7,
276
+ # "logits_temperature": 1.0,
277
+ # "fps": 1.0,
278
+ # "return_confidence": False,
279
+ # "top_k_tokens": 10,
280
+ # "debug": False,
281
+ # }
282
+ # output = model.chat_with_confidence(
283
+ # prompt, video_path, token_choices=token_choices, **generation_config
284
+ # )
285
+ # response = output["response"]
286
+ # print(f"Response: {response}")
287
 
288
+ # if generation_config["return_confidence"]:
289
+ # confidence = output["confidence"]
290
+ # print(f"Confidence: {confidence}")
291
+ # else:
292
+ # # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
293
+ # # otherwise, the raw logits are used, which are not filtered.
294
+ # logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
295
+ # print(f"\n{'─'*80}")
296
+ # print(f"TOP 10 TOKENS FROM {logits_type} LOGITS (outputs.scores):")
297
+ # print(f"{'─'*80}")
298
+ # top_k_tokens = output["top_k_tokens"]
299
+ # for i in range(len(top_k_tokens)):
300
+ # print(f"Top {i+1} token: {top_k_tokens[i][0]}")
301
+ # print(f"Top {i+1} token index: {top_k_tokens[i][1]}")
302
+ # print(f"Top {i+1} token logit: {top_k_tokens[i][2]}")
303
+ # print("--------------------------------")
models/qwen2_5vl.py CHANGED
@@ -2,11 +2,12 @@
2
 
3
  import torch
4
  from transformers import (
5
- Qwen2_5_VLForConditionalGeneration,
6
  AutoProcessor,
 
7
  )
8
  from typing import Optional, Dict, Any, Union, List
9
- from qwen_vl_utils import process_vision_info
10
 
11
  # Handle both relative and absolute imports
12
  try:
@@ -22,10 +23,22 @@ class Qwen2_5VLModel(BaseVideoModel):
22
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
23
  device_map: Optional[Union[str, Dict]] = "auto",
24
  attn_implementation: Optional[str] = "flash_attention_2",
 
 
25
  ):
26
  super().__init__(model_name)
27
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 
 
 
 
 
 
 
 
28
  model_name,
 
29
  dtype=dtype,
30
  device_map=device_map,
31
  attn_implementation=attn_implementation,
@@ -36,257 +49,254 @@ class Qwen2_5VLModel(BaseVideoModel):
36
  self,
37
  prompt: str,
38
  video_path: str,
39
- fps: float = 1.0,
40
  temperature: float = 0.7,
41
  max_new_tokens: int = 512,
42
  do_sample: Optional[bool] = True,
 
 
 
43
  **kwargs: Any,
44
  ) -> str:
 
 
 
 
 
45
  # Messages containing a local video path and a text query
46
- messages = [
47
  {
48
  "role": "user",
49
  "content": [
50
  {
51
- "type": "video",
52
  "video": video_path,
53
- # "max_pixels": 360 * 420,
54
- "fps": fps,
55
  },
56
- {"type": "text", "text": prompt},
57
  ],
58
- }
59
  ]
60
 
61
- text = self.processor.apply_chat_template(
62
- messages, tokenize=False, add_generation_prompt=True
63
- )
64
- image_inputs, video_inputs, video_kwargs = process_vision_info(
65
- messages, return_video_kwargs=True
66
- )
67
- inputs = self.processor(
68
- text=[text],
69
- images=image_inputs,
70
- videos=video_inputs,
71
- padding=True,
72
  return_tensors="pt",
73
- **video_kwargs,
74
- )
75
- inputs = inputs.to(self.model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- # Inference
78
- generated_ids = self.model.generate(
79
- **inputs,
80
- do_sample=do_sample,
81
- temperature=temperature,
82
- max_new_tokens=max_new_tokens,
83
- **kwargs,
84
- )
85
- generated_ids_trimmed = [
86
- out_ids[len(in_ids) :]
87
- for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
88
- ]
89
- output_response = self.processor.batch_decode(
90
- generated_ids_trimmed,
91
- skip_special_tokens=True,
92
- clean_up_tokenization_spaces=False,
93
- )[0]
94
- return output_response
95
 
96
- def chat_with_confidence(
97
- self,
98
- prompt: str,
99
- video_path: str,
100
- fps: float = 1.0,
101
- max_new_tokens: int = 512,
102
- temperature: float = 0.7,
103
- token_choices: Optional[List[str]] = ["Yes", "No"],
104
- logits_temperature: Optional[float] = 1.0,
105
- return_confidence: Optional[bool] = False,
106
- debug: Optional[bool] = False,
107
- ) -> Dict[str, Any]:
108
- """
109
- Returns the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
110
 
111
- Args:
112
- prompt (str): The text prompt to generate a response for.
113
- video_path (str): The path to the video file.
114
- fps (float, optional): The frames per second of the video. Defaults to 1.0.
115
- max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 128.
116
- temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
117
- logits_temperature (float, optional): The logits temperature to use for generation. Defaults to 1.0.
118
- token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
119
- return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
120
- debug (bool, optional): Whether to run in debug mode. Defaults to False.
121
 
122
- Returns:
123
- Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
 
 
 
 
 
 
 
124
 
125
- e.g., return_confidence: False
126
- Output:
127
- {
128
- "response": "Yes",
129
- "logits": {
130
- "Yes": 12.0,
131
- "No": 9.0
132
- }
133
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- e.g., return_confidence: True
136
- Output:
137
- {
138
- "response": "Yes",
139
- "confidence": 0.9999
140
- }
141
- """
142
- # Messages containing a local video path and a text query
143
- messages = [
144
- {
145
- "role": "user",
146
- "content": [
147
- {
148
- "type": "video",
149
- "video": video_path,
150
- # "max_pixels": 360 * 420,
151
- "fps": fps,
152
- },
153
- {"type": "text", "text": prompt},
154
- ],
155
- }
156
- ]
157
-
158
- text = self.processor.apply_chat_template(
159
- messages, tokenize=False, add_generation_prompt=True
160
- )
161
- image_inputs, video_inputs, video_kwargs = process_vision_info(
162
- messages, return_video_kwargs=True
163
- )
164
- inputs = self.processor(
165
- text=[text],
166
- images=image_inputs,
167
- videos=video_inputs,
168
- padding=True,
169
- return_tensors="pt",
170
- **video_kwargs,
171
- )
172
- inputs = inputs.to(self.model.device)
173
 
174
- # Inference with scores
175
- with torch.no_grad():
176
- outputs = self.model.generate(
177
- **inputs,
178
- temperature=temperature,
179
- max_new_tokens=max_new_tokens,
180
- output_scores=True,
181
- return_dict_in_generate=True,
182
- )
183
 
184
- generated_ids = outputs.sequences
185
- scores = outputs.scores # Tuple of tensors, one per generated token
186
- scores = tuple(
187
- s / logits_temperature for s in scores
188
- ) # Scales the logits by a factor for normalization during reporting
189
 
190
- print(f"Number of generated tokens: {len(scores)}")
191
- print(f"Vocabulary size: {scores[0].shape[1]}")
192
- # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
193
- if debug:
194
- print("****Running inference in debug mode****")
195
- # Print first token scores shape and max/min scores in debug mode
196
- print(f"Single token scores shape: {scores[0].shape}")
197
- print(
198
- f"First token max/min scores: {scores[0].max().item()}, {scores[0].min().item()}"
199
- )
200
- # Print details about top 3 tokens
201
- top_3_tokens = torch.topk(scores[0], k=3, dim=-1)
202
- for i in range(3):
203
- print(
204
- f"Pos 0 | {i+1}th Token: {self.processor.decode(top_3_tokens.indices[0, i].item())}"
205
- )
206
- print(
207
- f"Pos 0 | {i+1}th Token logit: {top_3_tokens.values[0, i].item()}"
208
- )
209
 
210
- # Trim the prompt tokens from generated sequences
211
- generated_ids_trimmed = [
212
- out_ids[len(in_ids) :]
213
- for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
214
- ]
215
 
216
- # Decode the text
217
- output_response = self.processor.batch_decode(
218
- generated_ids_trimmed,
219
- skip_special_tokens=True,
220
- clean_up_tokenization_spaces=False,
221
- )[0]
222
 
223
- # Convert scores to probabilities
224
- # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
225
- selected_token_probs = []
226
- selected_token_logits = []
227
- first_token_probs = torch.softmax(scores[0], dim=-1)
228
 
229
- # Now, find indices of tokens in token_choices and get their probabilities
230
- for token_choice in token_choices:
231
- # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
232
- token_index = self.processor.tokenizer.encode(
233
- token_choice, add_special_tokens=False
234
- )[0]
235
- selected_token_probs.append(first_token_probs[0, token_index].item())
236
- selected_token_logits.append(scores[0][0, token_index].item())
237
 
238
- # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
239
- if return_confidence:
240
- first_token_id = generated_ids_trimmed[0][
241
- 0
242
- ].item() # First token of the first sequence
243
- confidence = (
244
- first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
245
- if sum(selected_token_probs) > 0
246
- else 0.0
247
- )
248
- return {
249
- "response": output_response,
250
- "confidence": confidence,
251
- }
252
 
253
- # Retrn token logits
254
- else:
255
- token_logits = dict(zip(token_choices, selected_token_logits))
256
- return {
257
- "response": output_response,
258
- "logits": token_logits,
259
- }
260
 
261
 
262
- if __name__ == "__main__":
263
- model_path = "Qwen/Qwen2.5-VL-7B-Instruct" # "Qwen/Qwen2.5-VL-7B-Instruct"
264
- model = Qwen2_5VLModel(model_path)
265
- prompt = (
266
- "Which of the following exist in the video? Answer in A or B.\nA: Hand\nB: Face"
267
- )
268
- token_choices = ["A", "B"]
269
- ext = ".webm"
270
- video_path = "/home/shreyasj/Syed/data/Something-Something-V2/videos/101917" + ext
271
 
272
- generation_config = {
273
- "max_new_tokens": 128,
274
- "temperature": 0.7,
275
- "logits_temperature": 5.0,
276
- "fps": 3.0,
277
- "return_confidence": False,
278
- "debug": True,
279
- }
280
- output = model.chat_with_confidence(
281
- prompt, video_path, token_choices=token_choices, **generation_config
282
- )
283
- response = output["response"]
284
- print(f"Response: {response}")
285
 
286
- if generation_config["return_confidence"]:
287
- confidence = output["confidence"]
288
- print(f"Confidence: {confidence}")
289
- else:
290
- selected_token_logits = output["logits"]
291
- print(f"Selected token logits: {selected_token_logits}")
292
- print(f"Logits temperature: {generation_config['logits_temperature']}")
 
2
 
3
  import torch
4
  from transformers import (
5
+ AutoModelForImageTextToText,
6
  AutoProcessor,
7
+ BitsAndBytesConfig,
8
  )
9
  from typing import Optional, Dict, Any, Union, List
10
+ # from qwen_vl_utils import process_vision_info
11
 
12
  # Handle both relative and absolute imports
13
  try:
 
23
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
24
  device_map: Optional[Union[str, Dict]] = "auto",
25
  attn_implementation: Optional[str] = "flash_attention_2",
26
+ load_8bit: Optional[bool] = False,
27
+ load_4bit: Optional[bool] = False,
28
  ):
29
  super().__init__(model_name)
30
+ self.dtype = dtype
31
+ quantization_config = None
32
+ if load_8bit or load_4bit:
33
+ quantization_config = BitsAndBytesConfig(
34
+ load_in_8bit=load_8bit,
35
+ load_in_4bit=load_4bit,
36
+ bnb_4bit_quant_type="nf4",
37
+ bnb_4bit_compute_dtype=torch.float16
38
+ )
39
+ self.model = AutoModelForImageTextToText.from_pretrained(
40
  model_name,
41
+ quantization_config=quantization_config,
42
  dtype=dtype,
43
  device_map=device_map,
44
  attn_implementation=attn_implementation,
 
49
  self,
50
  prompt: str,
51
  video_path: str,
 
52
  temperature: float = 0.7,
53
  max_new_tokens: int = 512,
54
  do_sample: Optional[bool] = True,
55
+ fps: Optional[float] = 1.0,
56
+ num_frames: Optional[int] = 10,
57
+ video_mode: Optional[str] = "video",
58
  **kwargs: Any,
59
  ) -> str:
60
+ # Ensure only one of fps or num_frames is provided
61
+ if video_mode == "frames":
62
+ fps = None
63
+ elif video_mode == "video":
64
+ num_frames = None
65
  # Messages containing a local video path and a text query
66
+ conversation = [
67
  {
68
  "role": "user",
69
  "content": [
70
  {
71
+ "type": "video",
72
  "video": video_path,
 
 
73
  },
74
+ {"type": "text", "text": prompt}
75
  ],
76
+ },
77
  ]
78
 
79
+ inputs = self.processor.apply_chat_template(
80
+ conversation,
81
+ add_generation_prompt=True,
82
+ tokenize=True,
83
+ return_dict=True,
 
 
 
 
 
 
84
  return_tensors="pt",
85
+ do_sample_frames=True,
86
+ fps=fps,
87
+ num_frames=num_frames
88
+ ).to(self.model.device)
89
+ with torch.no_grad():
90
+ out = self.model.generate(
91
+ **inputs,
92
+ do_sample=do_sample,
93
+ temperature=temperature,
94
+ max_new_tokens=max_new_tokens,
95
+ **kwargs,
96
+ )
97
+ raw_response = self.processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
98
+ response = raw_response.split("assistant")[1].strip()
99
+ return response
100
+
101
 
102
+ # def chat_with_confidence(
103
+ # self,
104
+ # prompt: str,
105
+ # video_path: str,
106
+ # fps: Optional[float] = 1.0,
107
+ # num_frames: Optional[int] = 10,
108
+ # max_new_tokens: int = 512,
109
+ # temperature: float = 0.7,
110
+ # do_sample: Optional[bool] = True,
111
+ # video_mode: Optional[str] = "video",
112
+ # token_choices: Optional[List[str]] = ["Yes", "No"],
113
+ # logits_temperature: Optional[float] = 1.0,
114
+ # return_confidence: Optional[bool] = False,
115
+ # debug: Optional[bool] = False,
116
+ # **kwargs: Any,
117
+ # ) -> Dict[str, Any]:
118
+ # """
119
+ # Returns the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
120
 
121
+ # Args:
122
+ # prompt (str): The text prompt to generate a response for.
123
+ # video_path (str): The path to the video file.
124
+ # fps (float, optional): The frames per second of the video. Defaults to 1.0.
125
+ # max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 512.
126
+ # temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
127
+ # logits_temperature (float, optional): The logits temperature to use for generation. Defaults to 1.0.
128
+ # token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
129
+ # return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
130
+ # debug (bool, optional): Whether to run in debug mode. Defaults to False.
 
 
 
 
131
 
132
+ # Returns:
133
+ # Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
 
 
 
 
 
 
 
 
134
 
135
+ # e.g., return_confidence: False
136
+ # Output:
137
+ # {
138
+ # "response": "Yes",
139
+ # "logits": {
140
+ # "Yes": 12.0,
141
+ # "No": 9.0
142
+ # }
143
+ # }
144
 
145
+ # e.g., return_confidence: True
146
+ # Output:
147
+ # {
148
+ # "response": "Yes",
149
+ # "confidence": 0.9999
150
+ # }
151
+ # """
152
+ # # Messages containing a local video path and a text query
153
+ # messages = [
154
+ # {
155
+ # "role": "user",
156
+ # "content": [
157
+ # {
158
+ # "type": "video",
159
+ # "video": video_path,
160
+ # # "max_pixels": 360 * 420,
161
+ # "fps": fps,
162
+ # },
163
+ # {"type": "text", "text": prompt},
164
+ # ],
165
+ # }
166
+ # ]
167
 
168
+ # text = self.processor.apply_chat_template(
169
+ # messages, tokenize=False, add_generation_prompt=True
170
+ # )
171
+ # image_inputs, video_inputs, video_kwargs = process_vision_info(
172
+ # messages, return_video_kwargs=True
173
+ # )
174
+ # inputs = self.processor(
175
+ # text=[text],
176
+ # images=image_inputs,
177
+ # videos=video_inputs,
178
+ # padding=True,
179
+ # return_tensors="pt",
180
+ # **video_kwargs,
181
+ # )
182
+ # inputs = inputs.to(self.model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ # # Inference with scores
185
+ # with torch.no_grad():
186
+ # outputs = self.model.generate(
187
+ # **inputs,
188
+ # temperature=temperature,
189
+ # max_new_tokens=max_new_tokens,
190
+ # output_scores=True,
191
+ # return_dict_in_generate=True,
192
+ # )
193
 
194
+ # generated_ids = outputs.sequences
195
+ # scores = outputs.scores # Tuple of tensors, one per generated token
196
+ # scores = tuple(
197
+ # s / logits_temperature for s in scores
198
+ # ) # Scales the logits by a factor for normalization during reporting
199
 
200
+ # print(f"Number of generated tokens: {len(scores)}")
201
+ # print(f"Vocabulary size: {scores[0].shape[1]}")
202
+ # # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
203
+ # if debug:
204
+ # print("****Running inference in debug mode****")
205
+ # # Print first token scores shape and max/min scores in debug mode
206
+ # print(f"Single token scores shape: {scores[0].shape}")
207
+ # print(
208
+ # f"First token max/min scores: {scores[0].max().item()}, {scores[0].min().item()}"
209
+ # )
210
+ # # Print details about top 3 tokens
211
+ # top_3_tokens = torch.topk(scores[0], k=3, dim=-1)
212
+ # for i in range(3):
213
+ # print(
214
+ # f"Pos 0 | {i+1}th Token: {self.processor.decode(top_3_tokens.indices[0, i].item())}"
215
+ # )
216
+ # print(
217
+ # f"Pos 0 | {i+1}th Token logit: {top_3_tokens.values[0, i].item()}"
218
+ # )
219
 
220
+ # # Trim the prompt tokens from generated sequences
221
+ # generated_ids_trimmed = [
222
+ # out_ids[len(in_ids) :]
223
+ # for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
224
+ # ]
225
 
226
+ # # Decode the text
227
+ # output_response = self.processor.batch_decode(
228
+ # generated_ids_trimmed,
229
+ # skip_special_tokens=True,
230
+ # clean_up_tokenization_spaces=False,
231
+ # )[0]
232
 
233
+ # # Convert scores to probabilities
234
+ # # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
235
+ # selected_token_probs = []
236
+ # selected_token_logits = []
237
+ # first_token_probs = torch.softmax(scores[0], dim=-1)
238
 
239
+ # # Now, find indices of tokens in token_choices and get their probabilities
240
+ # for token_choice in token_choices:
241
+ # # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
242
+ # token_index = self.processor.tokenizer.encode(
243
+ # token_choice, add_special_tokens=False
244
+ # )[0]
245
+ # selected_token_probs.append(first_token_probs[0, token_index].item())
246
+ # selected_token_logits.append(scores[0][0, token_index].item())
247
 
248
+ # # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
249
+ # if return_confidence:
250
+ # first_token_id = generated_ids_trimmed[0][
251
+ # 0
252
+ # ].item() # First token of the first sequence
253
+ # confidence = (
254
+ # first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
255
+ # if sum(selected_token_probs) > 0
256
+ # else 0.0
257
+ # )
258
+ # return {
259
+ # "response": output_response,
260
+ # "confidence": confidence,
261
+ # }
262
 
263
+ # # Return token logits
264
+ # else:
265
+ # token_logits = dict(zip(token_choices, selected_token_logits))
266
+ # return {
267
+ # "response": output_response,
268
+ # "logits": token_logits,
269
+ # }
270
 
271
 
272
+ # if __name__ == "__main__":
273
+ # model_path = "Qwen/Qwen2.5-VL-7B-Instruct" # "Qwen/Qwen2.5-VL-7B-Instruct"
274
+ # model = Qwen2_5VLModel(model_path)
275
+ # prompt = (
276
+ # "Which of the following exist in the video? Answer in A or B.\nA: Hand\nB: Face"
277
+ # )
278
+ # token_choices = ["A", "B"]
279
+ # ext = ".webm"
280
+ # video_path = "/home/shreyasj/Syed/data/Something-Something-V2/videos/101917" + ext
281
 
282
+ # generation_config = {
283
+ # "max_new_tokens": 128,
284
+ # "temperature": 0.7,
285
+ # "logits_temperature": 5.0,
286
+ # "fps": 3.0,
287
+ # "return_confidence": False,
288
+ # "debug": True,
289
+ # }
290
+ # output = model.chat_with_confidence(
291
+ # prompt, video_path, token_choices=token_choices, **generation_config
292
+ # )
293
+ # response = output["response"]
294
+ # print(f"Response: {response}")
295
 
296
+ # if generation_config["return_confidence"]:
297
+ # confidence = output["confidence"]
298
+ # print(f"Confidence: {confidence}")
299
+ # else:
300
+ # selected_token_logits = output["logits"]
301
+ # print(f"Selected token logits: {selected_token_logits}")
302
+ # print(f"Logits temperature: {generation_config['logits_temperature']}")
models/qwen3vl.py CHANGED
@@ -2,14 +2,11 @@
2
 
3
  import torch
4
  from transformers import (
5
- Qwen3VLForConditionalGeneration,
6
  AutoProcessor,
 
7
  )
8
  from typing import Optional, Dict, Any, Union, List, Tuple
9
- from qwen_vl_utils import process_vision_info
10
- import cv2
11
- import numpy as np
12
- from PIL import Image
13
 
14
  # Handle both relative and absolute imports
15
  try:
@@ -18,50 +15,31 @@ except ImportError:
18
  from base import BaseVideoModel
19
 
20
 
21
- def downsample_video(video_path, max_dim=720, num_frames=10):
22
- vidcap = cv2.VideoCapture(video_path)
23
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
24
- frames = []
25
- frame_indices = np.linspace(
26
- 0, total_frames - 1, min(total_frames, num_frames), dtype=int
27
- )
28
-
29
- for i in frame_indices:
30
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
31
- success, image = vidcap.read()
32
- if success:
33
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
34
-
35
- h, w = image.shape[:2]
36
- scale = max_dim / max(h, w)
37
- if scale < 1:
38
- image = cv2.resize(
39
- image,
40
- (int(w * scale), int(h * scale)),
41
- interpolation=cv2.INTER_AREA,
42
- )
43
-
44
- pil_image = Image.fromarray(image)
45
- frames.append(pil_image)
46
-
47
- vidcap.release()
48
- return frames
49
-
50
-
51
  class Qwen3VLModel(BaseVideoModel):
52
  def __init__(
53
  self,
54
- model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
55
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
56
  device_map: Optional[Union[str, Dict]] = "auto",
57
  attn_implementation: Optional[str] = "flash_attention_2",
 
 
58
  ):
59
  super().__init__(model_name)
60
- self.model = Qwen3VLForConditionalGeneration.from_pretrained(
 
 
 
 
 
 
 
 
61
  model_name,
62
- dtype=dtype,
63
  device_map=device_map,
64
  attn_implementation=attn_implementation,
 
65
  )
66
  self.processor = AutoProcessor.from_pretrained(model_name)
67
 
@@ -69,467 +47,443 @@ class Qwen3VLModel(BaseVideoModel):
69
  self,
70
  prompt: str,
71
  video_path: str,
72
- fps: float = 1.0,
73
  temperature: float = 0.7,
74
  do_sample: Optional[
75
  bool
76
  ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
77
  max_new_tokens: int = 512,
78
  video_mode: Optional[str] = "video", # Choose from "video" or "frames"
79
- video_frames: Optional[int] = 10,
 
80
  **kwargs: Any,
81
  ) -> str:
82
- # Messages containing a local video path and a text query
83
- messages = [
84
- {
85
- "role": "user",
86
- "content": [
87
- {"type": "text", "text": prompt},
88
- ],
89
- }
90
- ]
91
- if video_mode == "video":
92
- messages[0]["content"].append(
93
- {
94
- "type": "video",
95
- "video": video_path,
96
- # "max_pixels": 360 * 420,
97
- "fps": fps,
98
- }
99
- )
100
- inputs = self.processor.apply_chat_template(
101
- messages,
102
- tokenize=True,
103
- add_generation_prompt=True,
104
- return_dict=True,
105
- return_tensors="pt",
106
- )
107
-
108
- elif video_mode == "frames":
109
- frames = downsample_video(video_path, max_dim=720, num_frames=video_frames)
110
- images_for_processor = []
111
- for frame in frames:
112
- messages[0]["content"].append({"type": "image"})
113
- images_for_processor.append(frame)
114
- prompt_full = self.processor.apply_chat_template(
115
- messages, tokenize=False, add_generation_prompt=True
116
- )
117
- inputs = self.processor(
118
- text=[prompt_full],
119
- images=images_for_processor,
120
- return_tensors="pt",
121
- padding=True,
122
- )
123
-
124
- inputs = inputs.to(self.model.device)
125
-
126
- generated_ids = self.model.generate(
127
- **inputs,
128
- max_new_tokens=max_new_tokens,
129
- temperature=temperature,
130
- do_sample=do_sample,
131
- **kwargs,
132
- )
133
-
134
- generated_ids_trimmed = [
135
- out_ids[len(in_ids) :]
136
- for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
137
- ]
138
-
139
- output_response = self.processor.batch_decode(
140
- generated_ids_trimmed,
141
- skip_special_tokens=True,
142
- clean_up_tokenization_spaces=False,
143
- )[0]
144
-
145
- return output_response
146
-
147
- def chat_with_confidence(
148
- self,
149
- prompt: str,
150
- video_path: str,
151
- fps: float = 1.0,
152
- max_new_tokens: int = 512,
153
- temperature: float = 0.7,
154
- do_sample: Optional[
155
- bool
156
- ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
157
- token_choices: Optional[List[str]] = ["Yes", "No"],
158
- logits_temperature: Optional[float] = 1.0,
159
- return_confidence: Optional[bool] = False,
160
- top_k_tokens: Optional[int] = 10,
161
- debug: Optional[bool] = False,
162
- **kwargs: Any,
163
- ) -> Dict[str, Any]:
164
- """
165
- Returns the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
166
-
167
- Args:
168
- prompt (str): The text prompt to generate a response for.
169
- video_path (str): The path to the video file.
170
- temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
171
- max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 512.
172
- token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
173
- generation_config (Dict[str, Any], optional): The generation configuration. Defaults to None.
174
- return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
175
- top_k_tokens (int, optional): The number of top tokens to return. Defaults to 10. Only applicable if return_confidence is False.
176
- debug (bool, optional): Whether to run in debug mode. Defaults to False.
177
-
178
- Returns:
179
- Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
180
-
181
- e.g., return_confidence: False
182
- Output:
183
- {
184
- "response": "Yes",
185
- "top_k_tokens": [("Yes", 12.0, 12), ("No", 9.0, 9)],
186
- }
187
-
188
- e.g., return_confidence: True
189
- Output:
190
- {
191
- "response": "Yes",
192
- "confidence": 0.9999
193
- }
194
- """
195
- # Messages containing a local video path and a text query
196
- messages = [
197
  {
198
  "role": "user",
199
  "content": [
200
  {
201
- "type": "video",
202
  "video": video_path,
203
- # "max_pixels": 360 * 420,
204
- "fps": fps,
205
  },
206
- {"type": "text", "text": prompt},
207
  ],
208
- }
209
  ]
210
 
211
  inputs = self.processor.apply_chat_template(
212
- messages,
213
- tokenize=True,
214
- add_generation_prompt=True,
215
- return_dict=True,
216
  return_tensors="pt",
217
- )
218
-
219
- inputs = inputs.to(self.model.device)
220
-
221
- # In debug mode, inspect what logits processors will be used
222
- if debug:
223
- print("\n" + "=" * 80)
224
- print("INSPECTING GENERATION CONFIG & WARPERS")
225
- print("=" * 80)
226
- # Get the generation config to see what processors will be added
227
- gen_config = self.model.generation_config
228
- print(f"Generation config attributes:")
229
- print(f" Processor-related:")
230
- print(
231
- f" - repetition_penalty: {getattr(gen_config, 'repetition_penalty', None)}"
232
- )
233
- print(
234
- f" - no_repeat_ngram_size: {getattr(gen_config, 'no_repeat_ngram_size', None)}"
235
- )
236
- print(
237
- f" - encoder_no_repeat_ngram_size: {getattr(gen_config, 'encoder_no_repeat_ngram_size', None)}"
238
- )
239
- print(f" - bad_words_ids: {getattr(gen_config, 'bad_words_ids', None)}")
240
- print(f" - min_length: {getattr(gen_config, 'min_length', None)}")
241
- print(
242
- f" - forced_bos_token_id: {getattr(gen_config, 'forced_bos_token_id', None)}"
243
- )
244
- print(
245
- f" - forced_eos_token_id: {getattr(gen_config, 'forced_eos_token_id', None)}"
246
- )
247
- print(f" Warper-related (THESE MASK TOKENS TO -INF):")
248
- print(f" - temperature: {temperature} (passed as arg)")
249
- print(
250
- f" - do_sample: {getattr(gen_config, 'do_sample', 'Not set (will be inferred)')}"
251
- )
252
- print(f" - top_k: {getattr(gen_config, 'top_k', None)}")
253
- print(f" - top_p: {getattr(gen_config, 'top_p', None)}")
254
- print(f" - typical_p: {getattr(gen_config, 'typical_p', None)}")
255
- print(
256
- f" - epsilon_cutoff: {getattr(gen_config, 'epsilon_cutoff', None)}"
257
- )
258
- print(f" - eta_cutoff: {getattr(gen_config, 'eta_cutoff', None)}")
259
- print(
260
- f"\n ⚠️ If top_k or top_p are set, they will mask non-selected tokens to -inf!"
261
- )
262
- print("=" * 80 + "\n")
263
-
264
- # Inference with scores
265
  with torch.no_grad():
266
- outputs = self.model.generate(
267
  **inputs,
 
268
  temperature=temperature,
269
  max_new_tokens=max_new_tokens,
270
- do_sample=do_sample,
271
- output_scores=True,
272
- output_logits=True, # Get TRUE raw logits before any processing
273
- return_dict_in_generate=True,
274
  **kwargs,
275
  )
276
-
277
- generated_ids = outputs.sequences
278
- scores = outputs.scores # Tuple of tensors - PROCESSED logits used for sampling
279
- logits = (
280
- outputs.logits if hasattr(outputs, "logits") else None
281
- ) # TRUE raw logits from model
282
-
283
- scores = tuple(
284
- s / logits_temperature for s in scores
285
- ) # Scales the logits by a factor for normalization during reporting
286
-
287
- print(f"Number of generated tokens: {len(scores)}")
288
- print(f"Vocabulary size: {scores[0].shape[1]}")
289
-
290
- # Check if logits differ from scores
291
- if debug and logits is not None:
292
- print(f"\n[IMPORTANT] output_logits available: True")
293
- print(
294
- f"[IMPORTANT] Comparing outputs.logits (raw) vs outputs.scores (processed):"
295
- )
296
- logits_raw = logits[0] / logits_temperature # First token's raw logits
297
- scores_first = scores[0] # First token's processed scores
298
-
299
- logits_diff = (logits_raw.cpu() - scores_first.cpu()).abs()
300
- max_diff = logits_diff.max().item()
301
- if max_diff > 0.001:
302
- print(
303
- f"[IMPORTANT] ⚠️ outputs.scores ARE DIFFERENT from outputs.logits!"
304
- )
305
- print(f"[IMPORTANT] Max difference: {max_diff:.6f}")
306
- print(
307
- f"[IMPORTANT] This means outputs.scores are PROCESSED, not raw!"
308
- )
309
- else:
310
- print(f"[IMPORTANT] βœ“ outputs.scores == outputs.logits (both are raw)")
311
- elif debug:
312
- print(
313
- f"\n[IMPORTANT] output_logits not available in this transformers version"
314
- )
315
-
316
- # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
317
- if debug:
318
- print("\n" + "=" * 80)
319
- print("****Running inference in debug mode****")
320
- print("=" * 80)
321
-
322
- # Use truly raw logits if available, otherwise use scores
323
- raw_logits_to_show = (
324
- logits[0] / logits_temperature if logits is not None else scores[0]
325
- )
326
- logits_label = (
327
- "TRUE RAW LOGITS (from outputs.logits)"
328
- if logits is not None
329
- else "LOGITS (from outputs.scores)"
330
- )
331
-
332
- # Print first token scores shape and max/min scores in debug mode
333
- print(
334
- f"\n[{logits_label}] Single token scores shape: {raw_logits_to_show.shape}"
335
- )
336
- print(
337
- f"[{logits_label}] First token max/min: {raw_logits_to_show.max().item():.4f}, {raw_logits_to_show.min().item():.4f}"
338
- )
339
-
340
- # Print details about top 3 tokens from RAW logits
341
- print(f"\n{'─'*80}")
342
- print(f"TOP 3 TOKENS FROM {logits_label}:")
343
- print(f"{'─'*80}")
344
- top_3_tokens = torch.topk(raw_logits_to_show, k=3, dim=-1)
345
- for i in range(3):
346
- token_id = top_3_tokens.indices[0, i].item()
347
- token_text = self.processor.decode(token_id)
348
- token_logit = top_3_tokens.values[0, i].item()
349
- print(
350
- f" #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
351
- )
352
-
353
- # Now compare with POST-PROCESSED logits (outputs.scores)
354
- scores_first = scores[0] / logits_temperature
355
- print(f"\n{'─'*80}")
356
- print("TOP 3 TOKENS FROM LOGITS CAPTURE (after all processors):")
357
- print(f"{'─'*80}")
358
- print(
359
- f"[POST-PROCESSED] Max/min logits: {scores_first.max().item():.4f}, {scores_first.min().item():.4f}"
360
- )
361
-
362
- top_3_processed = torch.topk(scores_first, k=3, dim=-1)
363
- for i in range(3):
364
- token_id = top_3_processed.indices[0, i].item()
365
- token_text = self.processor.decode(token_id)
366
- token_logit = top_3_processed.values[0, i].item()
367
- print(
368
- f" #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
369
- )
370
-
371
- # Check if the distributions differ (compare against truly raw logits if available)
372
- print(f"\n{'─'*80}")
373
- print("DIFFERENCE ANALYSIS (Raw β†’ Post-Processed):")
374
- print(f"{'─'*80}")
375
- logit_diff = (scores_first.cpu() - raw_logits_to_show.cpu()).abs()
376
- max_diff = logit_diff.max().item()
377
- num_changed = (logit_diff > 0.001).sum().item()
378
-
379
- print(f" Max logit difference: {max_diff:.6f}")
380
- print(
381
- f" Number of tokens with changed logits: {num_changed}/{raw_logits_to_show.shape[1]}"
382
- )
383
-
384
- if max_diff > 0.001:
385
- print(f"\n ⚠️ LOGITS WERE MODIFIED BY PROCESSORS!")
386
- # Show which tokens changed the most
387
- top_changes = torch.topk(logit_diff[0], k=min(5, num_changed))
388
- print(f"\n Top 5 most changed tokens:")
389
- for i in range(min(5, len(top_changes.indices))):
390
- token_id = top_changes.indices[i].item()
391
- token_text = self.processor.decode(token_id)
392
- raw_logit = raw_logits_to_show[0, token_id].item()
393
- processed_logit = scores_first[0, token_id].item()
394
- diff = top_changes.values[i].item()
395
- print(f" Token='{token_text}' | ID={token_id}")
396
- print(
397
- f" Raw: {raw_logit:.4f} β†’ Processed: {processed_logit:.4f} (Ξ”={diff:.4f})"
398
- )
399
- else:
400
- print(f" βœ“ No significant modifications detected")
401
-
402
- # Show what token was actually selected
403
- print(f"\n{'─'*80}")
404
- print("ACTUALLY GENERATED TOKEN:")
405
- print(f"{'─'*80}")
406
- first_generated_id = generated_ids[0, len(inputs.input_ids[0])].item()
407
- first_generated_token = self.processor.decode(first_generated_id)
408
- raw_logit_for_generated = raw_logits_to_show[0, first_generated_id].item()
409
-
410
- print(f" Token: '{first_generated_token}' | ID={first_generated_id}")
411
- print(f" Raw logit: {raw_logit_for_generated:.4f}")
412
-
413
- processed_logit_for_generated = scores_first[0, first_generated_id].item()
414
- print(f" Post-processed logit: {processed_logit_for_generated:.4f}")
415
-
416
- # Check if this token is in top-k of raw logits
417
- top_k_raw_indices = torch.topk(
418
- raw_logits_to_show, k=min(10, raw_logits_to_show.shape[1]), dim=-1
419
- ).indices[0]
420
- is_in_top10_raw = first_generated_id in top_k_raw_indices
421
- print(f" In top-10 of RAW logits: {is_in_top10_raw}")
422
-
423
- if not is_in_top10_raw:
424
- print(
425
- f"\n 🚨 CRITICAL: Generated token was NOT in top-10 of raw logits!"
426
- )
427
- print(
428
- f" This proves that logits processors modified the distribution."
429
- )
430
- # Find the rank of the generated token in raw logits
431
- sorted_raw = torch.argsort(raw_logits_to_show[0], descending=True)
432
- raw_rank = (sorted_raw == first_generated_id).nonzero(as_tuple=True)[
433
- 0
434
- ].item() + 1
435
- print(f" Raw logits rank: {raw_rank}")
436
-
437
- print("=" * 80 + "\n")
438
-
439
- # Trim the prompt tokens from generated sequences
440
- generated_ids_trimmed = [
441
- out_ids[len(in_ids) :]
442
- for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
443
- ]
444
-
445
- # Decode the text
446
- output_response = self.processor.batch_decode(
447
- generated_ids_trimmed,
448
- skip_special_tokens=True,
449
- clean_up_tokenization_spaces=False,
450
- )[0]
451
-
452
- # Convert scores to probabilities
453
- # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
454
- selected_token_probs = []
455
- selected_token_logits = []
456
- first_token_probs = torch.softmax(scores[0], dim=-1)
457
-
458
- # Now, find indices of tokens in token_choices and get their probabilities
459
- for token_choice in token_choices:
460
- # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
461
- token_index = self.processor.tokenizer.encode(
462
- token_choice, add_special_tokens=False
463
- )[0]
464
- selected_token_probs.append(first_token_probs[0, token_index].item())
465
- selected_token_logits.append(scores[0][0, token_index].item())
466
-
467
- # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
468
- if return_confidence:
469
- first_token_id = generated_ids_trimmed[0][
470
- 0
471
- ].item() # First token of the first sequence
472
- confidence = (
473
- first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
474
- if sum(selected_token_probs) > 0
475
- else 0.0
476
- )
477
- return {
478
- "response": output_response,
479
- "confidence": confidence,
480
- }
481
-
482
- # Return token logits
483
- else:
484
- token_logits = dict(zip(token_choices, selected_token_logits))
485
- top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
486
- top_k_tokens_list: List[Tuple[str, int, float]] = []
487
- for i in range(top_k_tokens):
488
- logit_index = top_k_logits_indices.indices[0, i].item()
489
- token = self.processor.decode(logit_index)
490
- logit = top_k_logits_indices.values[0, i].item()
491
- top_k_tokens_list.append((token, logit_index, logit))
492
- return {
493
- "response": output_response,
494
- "top_k_tokens": top_k_tokens_list,
495
- "token_logits": token_logits,
496
- }
497
-
498
-
499
- if __name__ == "__main__":
500
- model_path = "Qwen/Qwen3-VL-4B-Instruct" # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
501
- model = Qwen3VLModel(model_path)
502
- prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying to bend stick so nothing happens\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Stick is held by hands at two distinct points\n- Stick is intact\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Stick retains its original geometric shape\n- Stick remains intact\n\nAnswer:'
503
- token_choices = ["Yes", "No"]
504
- video_path = (
505
- "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/188064.mp4"
506
- )
507
-
508
- generation_config = {
509
- "max_new_tokens": 128,
510
- "do_sample": True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
511
- "temperature": 0.7,
512
- "logits_temperature": 1.0,
513
- "fps": 1.0,
514
- "return_confidence": False,
515
- "top_k_tokens": 10,
516
- "debug": False,
517
- }
518
- output = model.chat_with_confidence(
519
- prompt, video_path, token_choices=token_choices, **generation_config
520
- )
521
- response = output["response"]
522
- print(f"Response: {response}")
523
-
524
- if generation_config["return_confidence"]:
525
- confidence = output["confidence"]
526
- print(f"Confidence: {confidence}")
527
- else:
528
- # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
529
- # otherwise, the raw logits are used, which are not filtered.
530
- logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
531
- top_k_tokens = output["top_k_tokens"]
532
- for i in range(len(top_k_tokens)):
533
- print(f"Top {i+1} token: {top_k_tokens[i][0]}")
534
- print(f"Top {i+1} token logit: {top_k_tokens[i][2]}")
535
- print("--------------------------------")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import torch
4
  from transformers import (
5
+ AutoModelForImageTextToText,
6
  AutoProcessor,
7
+ BitsAndBytesConfig,
8
  )
9
  from typing import Optional, Dict, Any, Union, List, Tuple
 
 
 
 
10
 
11
  # Handle both relative and absolute imports
12
  try:
 
15
  from base import BaseVideoModel
16
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  class Qwen3VLModel(BaseVideoModel):
19
  def __init__(
20
  self,
21
+ model_name: str = "Qwen/Qwen3-VL-4B-Instruct",
22
  dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
23
  device_map: Optional[Union[str, Dict]] = "auto",
24
  attn_implementation: Optional[str] = "flash_attention_2",
25
+ load_8bit: Optional[bool] = False,
26
+ load_4bit: Optional[bool] = False,
27
  ):
28
  super().__init__(model_name)
29
+ quantization_config = None
30
+ if load_8bit or load_4bit:
31
+ quantization_config = BitsAndBytesConfig(
32
+ load_in_8bit=load_8bit,
33
+ load_in_4bit=load_4bit,
34
+ bnb_4bit_quant_type="nf4",
35
+ bnb_4bit_compute_dtype=torch.float16
36
+ )
37
+ self.model = AutoModelForImageTextToText.from_pretrained(
38
  model_name,
39
+ quantization_config=quantization_config,
40
  device_map=device_map,
41
  attn_implementation=attn_implementation,
42
+ dtype=dtype,
43
  )
44
  self.processor = AutoProcessor.from_pretrained(model_name)
45
 
 
47
  self,
48
  prompt: str,
49
  video_path: str,
 
50
  temperature: float = 0.7,
51
  do_sample: Optional[
52
  bool
53
  ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
54
  max_new_tokens: int = 512,
55
  video_mode: Optional[str] = "video", # Choose from "video" or "frames"
56
+ fps: Optional[float] = 1.0,
57
+ num_frames: Optional[int] = 10,
58
  **kwargs: Any,
59
  ) -> str:
60
+ # Ensure only one of fps or num_frames is provided
61
+ if video_mode == "frames":
62
+ fps = None
63
+ elif video_mode == "video":
64
+ num_frames = None
65
+ conversation = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  {
67
  "role": "user",
68
  "content": [
69
  {
70
+ "type": "video",
71
  "video": video_path,
 
 
72
  },
73
+ {"type": "text", "text": prompt}
74
  ],
75
+ },
76
  ]
77
 
78
  inputs = self.processor.apply_chat_template(
79
+ conversation,
80
+ add_generation_prompt=True,
81
+ tokenize=True,
82
+ return_dict=True,
83
  return_tensors="pt",
84
+ do_sample_frames=True,
85
+ fps=fps,
86
+ num_frames=num_frames
87
+ ).to(self.model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  with torch.no_grad():
89
+ out = self.model.generate(
90
  **inputs,
91
+ do_sample=do_sample,
92
  temperature=temperature,
93
  max_new_tokens=max_new_tokens,
 
 
 
 
94
  **kwargs,
95
  )
96
+ raw_response = self.processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
97
+ response = raw_response.split("assistant")[1].strip()
98
+ return response
99
+
100
+ # def chat_with_confidence(
101
+ # self,
102
+ # prompt: str,
103
+ # video_path: str,
104
+ # max_new_tokens: int = 512,
105
+ # temperature: float = 0.7,
106
+ # do_sample: Optional[
107
+ # bool
108
+ # ] = True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
109
+ # fps: Optional[float] = 1.0,
110
+ # num_frames: Optional[int] = 10,
111
+ # token_choices: Optional[List[str]] = ["Yes", "No"],
112
+ # logits_temperature: Optional[float] = 1.0,
113
+ # return_confidence: Optional[bool] = False,
114
+ # top_k_tokens: Optional[int] = 10,
115
+ # debug: Optional[bool] = False,
116
+ # **kwargs: Any,
117
+ # ) -> Dict[str, Any]:
118
+ # """
119
+ # Returns the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
120
+
121
+ # Args:
122
+ # prompt (str): The text prompt to generate a response for.
123
+ # video_path (str): The path to the video file.
124
+ # temperature (float, optional): The temperature to use for generation. Defaults to 0.7.
125
+ # max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 512.
126
+ # token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
127
+ # generation_config (Dict[str, Any], optional): The generation configuration. Defaults to None.
128
+ # return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
129
+ # top_k_tokens (int, optional): The number of top tokens to return. Defaults to 10. Only applicable if return_confidence is False.
130
+ # debug (bool, optional): Whether to run in debug mode. Defaults to False.
131
+
132
+ # Returns:
133
+ # Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
134
+
135
+ # e.g., return_confidence: False
136
+ # Output:
137
+ # {
138
+ # "response": "Yes",
139
+ # "top_k_tokens": [("Yes", 12.0, 12), ("No", 9.0, 9)],
140
+ # }
141
+
142
+ # e.g., return_confidence: True
143
+ # Output:
144
+ # {
145
+ # "response": "Yes",
146
+ # "confidence": 0.9999
147
+ # }
148
+ # """
149
+ # # Messages containing a local video path and a text query
150
+ # messages = [
151
+ # {
152
+ # "role": "user",
153
+ # "content": [
154
+ # {
155
+ # "type": "video",
156
+ # "video": video_path,
157
+ # # "max_pixels": 360 * 420,
158
+ # "fps": fps,
159
+ # },
160
+ # {"type": "text", "text": prompt},
161
+ # ],
162
+ # }
163
+ # ]
164
+
165
+ # inputs = self.processor.apply_chat_template(
166
+ # messages,
167
+ # tokenize=True,
168
+ # add_generation_prompt=True,
169
+ # return_dict=True,
170
+ # return_tensors="pt",
171
+ # )
172
+
173
+ # inputs = inputs.to(self.model.device)
174
+
175
+ # # In debug mode, inspect what logits processors will be used
176
+ # if debug:
177
+ # print("\n" + "=" * 80)
178
+ # print("INSPECTING GENERATION CONFIG & WARPERS")
179
+ # print("=" * 80)
180
+ # # Get the generation config to see what processors will be added
181
+ # gen_config = self.model.generation_config
182
+ # print(f"Generation config attributes:")
183
+ # print(f" Processor-related:")
184
+ # print(
185
+ # f" - repetition_penalty: {getattr(gen_config, 'repetition_penalty', None)}"
186
+ # )
187
+ # print(
188
+ # f" - no_repeat_ngram_size: {getattr(gen_config, 'no_repeat_ngram_size', None)}"
189
+ # )
190
+ # print(
191
+ # f" - encoder_no_repeat_ngram_size: {getattr(gen_config, 'encoder_no_repeat_ngram_size', None)}"
192
+ # )
193
+ # print(f" - bad_words_ids: {getattr(gen_config, 'bad_words_ids', None)}")
194
+ # print(f" - min_length: {getattr(gen_config, 'min_length', None)}")
195
+ # print(
196
+ # f" - forced_bos_token_id: {getattr(gen_config, 'forced_bos_token_id', None)}"
197
+ # )
198
+ # print(
199
+ # f" - forced_eos_token_id: {getattr(gen_config, 'forced_eos_token_id', None)}"
200
+ # )
201
+ # print(f" Warper-related (THESE MASK TOKENS TO -INF):")
202
+ # print(f" - temperature: {temperature} (passed as arg)")
203
+ # print(
204
+ # f" - do_sample: {getattr(gen_config, 'do_sample', 'Not set (will be inferred)')}"
205
+ # )
206
+ # print(f" - top_k: {getattr(gen_config, 'top_k', None)}")
207
+ # print(f" - top_p: {getattr(gen_config, 'top_p', None)}")
208
+ # print(f" - typical_p: {getattr(gen_config, 'typical_p', None)}")
209
+ # print(
210
+ # f" - epsilon_cutoff: {getattr(gen_config, 'epsilon_cutoff', None)}"
211
+ # )
212
+ # print(f" - eta_cutoff: {getattr(gen_config, 'eta_cutoff', None)}")
213
+ # print(
214
+ # f"\n ⚠️ If top_k or top_p are set, they will mask non-selected tokens to -inf!"
215
+ # )
216
+ # print("=" * 80 + "\n")
217
+
218
+ # # Inference with scores
219
+ # with torch.no_grad():
220
+ # outputs = self.model.generate(
221
+ # **inputs,
222
+ # temperature=temperature,
223
+ # max_new_tokens=max_new_tokens,
224
+ # do_sample=do_sample,
225
+ # output_scores=True,
226
+ # output_logits=True, # Get TRUE raw logits before any processing
227
+ # return_dict_in_generate=True,
228
+ # **kwargs,
229
+ # )
230
+
231
+ # generated_ids = outputs.sequences
232
+ # scores = outputs.scores # Tuple of tensors - PROCESSED logits used for sampling
233
+ # logits = (
234
+ # outputs.logits if hasattr(outputs, "logits") else None
235
+ # ) # TRUE raw logits from model
236
+
237
+ # scores = tuple(
238
+ # s / logits_temperature for s in scores
239
+ # ) # Scales the logits by a factor for normalization during reporting
240
+
241
+ # print(f"Number of generated tokens: {len(scores)}")
242
+ # print(f"Vocabulary size: {scores[0].shape[1]}")
243
+
244
+ # # Check if logits differ from scores
245
+ # if debug and logits is not None:
246
+ # print(f"\n[IMPORTANT] output_logits available: True")
247
+ # print(
248
+ # f"[IMPORTANT] Comparing outputs.logits (raw) vs outputs.scores (processed):"
249
+ # )
250
+ # logits_raw = logits[0] / logits_temperature # First token's raw logits
251
+ # scores_first = scores[0] # First token's processed scores
252
+
253
+ # logits_diff = (logits_raw.cpu() - scores_first.cpu()).abs()
254
+ # max_diff = logits_diff.max().item()
255
+ # if max_diff > 0.001:
256
+ # print(
257
+ # f"[IMPORTANT] ⚠️ outputs.scores ARE DIFFERENT from outputs.logits!"
258
+ # )
259
+ # print(f"[IMPORTANT] Max difference: {max_diff:.6f}")
260
+ # print(
261
+ # f"[IMPORTANT] This means outputs.scores are PROCESSED, not raw!"
262
+ # )
263
+ # else:
264
+ # print(f"[IMPORTANT] βœ“ outputs.scores == outputs.logits (both are raw)")
265
+ # elif debug:
266
+ # print(
267
+ # f"\n[IMPORTANT] output_logits not available in this transformers version"
268
+ # )
269
+
270
+ # # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
271
+ # if debug:
272
+ # print("\n" + "=" * 80)
273
+ # print("****Running inference in debug mode****")
274
+ # print("=" * 80)
275
+
276
+ # # Use truly raw logits if available, otherwise use scores
277
+ # raw_logits_to_show = (
278
+ # logits[0] / logits_temperature if logits is not None else scores[0]
279
+ # )
280
+ # logits_label = (
281
+ # "TRUE RAW LOGITS (from outputs.logits)"
282
+ # if logits is not None
283
+ # else "LOGITS (from outputs.scores)"
284
+ # )
285
+
286
+ # # Print first token scores shape and max/min scores in debug mode
287
+ # print(
288
+ # f"\n[{logits_label}] Single token scores shape: {raw_logits_to_show.shape}"
289
+ # )
290
+ # print(
291
+ # f"[{logits_label}] First token max/min: {raw_logits_to_show.max().item():.4f}, {raw_logits_to_show.min().item():.4f}"
292
+ # )
293
+
294
+ # # Print details about top 3 tokens from RAW logits
295
+ # print(f"\n{'─'*80}")
296
+ # print(f"TOP 3 TOKENS FROM {logits_label}:")
297
+ # print(f"{'─'*80}")
298
+ # top_3_tokens = torch.topk(raw_logits_to_show, k=3, dim=-1)
299
+ # for i in range(3):
300
+ # token_id = top_3_tokens.indices[0, i].item()
301
+ # token_text = self.processor.decode(token_id)
302
+ # token_logit = top_3_tokens.values[0, i].item()
303
+ # print(
304
+ # f" #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
305
+ # )
306
+
307
+ # # Now compare with POST-PROCESSED logits (outputs.scores)
308
+ # scores_first = scores[0] / logits_temperature
309
+ # print(f"\n{'─'*80}")
310
+ # print("TOP 3 TOKENS FROM LOGITS CAPTURE (after all processors):")
311
+ # print(f"{'─'*80}")
312
+ # print(
313
+ # f"[POST-PROCESSED] Max/min logits: {scores_first.max().item():.4f}, {scores_first.min().item():.4f}"
314
+ # )
315
+
316
+ # top_3_processed = torch.topk(scores_first, k=3, dim=-1)
317
+ # for i in range(3):
318
+ # token_id = top_3_processed.indices[0, i].item()
319
+ # token_text = self.processor.decode(token_id)
320
+ # token_logit = top_3_processed.values[0, i].item()
321
+ # print(
322
+ # f" #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
323
+ # )
324
+
325
+ # # Check if the distributions differ (compare against truly raw logits if available)
326
+ # print(f"\n{'─'*80}")
327
+ # print("DIFFERENCE ANALYSIS (Raw β†’ Post-Processed):")
328
+ # print(f"{'─'*80}")
329
+ # logit_diff = (scores_first.cpu() - raw_logits_to_show.cpu()).abs()
330
+ # max_diff = logit_diff.max().item()
331
+ # num_changed = (logit_diff > 0.001).sum().item()
332
+
333
+ # print(f" Max logit difference: {max_diff:.6f}")
334
+ # print(
335
+ # f" Number of tokens with changed logits: {num_changed}/{raw_logits_to_show.shape[1]}"
336
+ # )
337
+
338
+ # if max_diff > 0.001:
339
+ # print(f"\n ⚠️ LOGITS WERE MODIFIED BY PROCESSORS!")
340
+ # # Show which tokens changed the most
341
+ # top_changes = torch.topk(logit_diff[0], k=min(5, num_changed))
342
+ # print(f"\n Top 5 most changed tokens:")
343
+ # for i in range(min(5, len(top_changes.indices))):
344
+ # token_id = top_changes.indices[i].item()
345
+ # token_text = self.processor.decode(token_id)
346
+ # raw_logit = raw_logits_to_show[0, token_id].item()
347
+ # processed_logit = scores_first[0, token_id].item()
348
+ # diff = top_changes.values[i].item()
349
+ # print(f" Token='{token_text}' | ID={token_id}")
350
+ # print(
351
+ # f" Raw: {raw_logit:.4f} β†’ Processed: {processed_logit:.4f} (Ξ”={diff:.4f})"
352
+ # )
353
+ # else:
354
+ # print(f" βœ“ No significant modifications detected")
355
+
356
+ # # Show what token was actually selected
357
+ # print(f"\n{'─'*80}")
358
+ # print("ACTUALLY GENERATED TOKEN:")
359
+ # print(f"{'─'*80}")
360
+ # first_generated_id = generated_ids[0, len(inputs.input_ids[0])].item()
361
+ # first_generated_token = self.processor.decode(first_generated_id)
362
+ # raw_logit_for_generated = raw_logits_to_show[0, first_generated_id].item()
363
+
364
+ # print(f" Token: '{first_generated_token}' | ID={first_generated_id}")
365
+ # print(f" Raw logit: {raw_logit_for_generated:.4f}")
366
+
367
+ # processed_logit_for_generated = scores_first[0, first_generated_id].item()
368
+ # print(f" Post-processed logit: {processed_logit_for_generated:.4f}")
369
+
370
+ # # Check if this token is in top-k of raw logits
371
+ # top_k_raw_indices = torch.topk(
372
+ # raw_logits_to_show, k=min(10, raw_logits_to_show.shape[1]), dim=-1
373
+ # ).indices[0]
374
+ # is_in_top10_raw = first_generated_id in top_k_raw_indices
375
+ # print(f" In top-10 of RAW logits: {is_in_top10_raw}")
376
+
377
+ # if not is_in_top10_raw:
378
+ # print(
379
+ # f"\n 🚨 CRITICAL: Generated token was NOT in top-10 of raw logits!"
380
+ # )
381
+ # print(
382
+ # f" This proves that logits processors modified the distribution."
383
+ # )
384
+ # # Find the rank of the generated token in raw logits
385
+ # sorted_raw = torch.argsort(raw_logits_to_show[0], descending=True)
386
+ # raw_rank = (sorted_raw == first_generated_id).nonzero(as_tuple=True)[
387
+ # 0
388
+ # ].item() + 1
389
+ # print(f" Raw logits rank: {raw_rank}")
390
+
391
+ # print("=" * 80 + "\n")
392
+
393
+ # # Trim the prompt tokens from generated sequences
394
+ # generated_ids_trimmed = [
395
+ # out_ids[len(in_ids) :]
396
+ # for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
397
+ # ]
398
+
399
+ # # Decode the text
400
+ # output_response = self.processor.batch_decode(
401
+ # generated_ids_trimmed,
402
+ # skip_special_tokens=True,
403
+ # clean_up_tokenization_spaces=False,
404
+ # )[0]
405
+
406
+ # # Convert scores to probabilities
407
+ # # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
408
+ # selected_token_probs = []
409
+ # selected_token_logits = []
410
+ # first_token_probs = torch.softmax(scores[0], dim=-1)
411
+
412
+ # # Now, find indices of tokens in token_choices and get their probabilities
413
+ # for token_choice in token_choices:
414
+ # # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
415
+ # token_index = self.processor.tokenizer.encode(
416
+ # token_choice, add_special_tokens=False
417
+ # )[0]
418
+ # selected_token_probs.append(first_token_probs[0, token_index].item())
419
+ # selected_token_logits.append(scores[0][0, token_index].item())
420
+
421
+ # # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
422
+ # if return_confidence:
423
+ # first_token_id = generated_ids_trimmed[0][
424
+ # 0
425
+ # ].item() # First token of the first sequence
426
+ # confidence = (
427
+ # first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
428
+ # if sum(selected_token_probs) > 0
429
+ # else 0.0
430
+ # )
431
+ # return {
432
+ # "response": output_response,
433
+ # "confidence": confidence,
434
+ # }
435
+
436
+ # # Return token logits
437
+ # else:
438
+ # token_logits = dict(zip(token_choices, selected_token_logits))
439
+ # top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
440
+ # top_k_tokens_list: List[Tuple[str, int, float]] = []
441
+ # for i in range(top_k_tokens):
442
+ # logit_index = top_k_logits_indices.indices[0, i].item()
443
+ # token = self.processor.decode(logit_index)
444
+ # logit = top_k_logits_indices.values[0, i].item()
445
+ # top_k_tokens_list.append((token, logit_index, logit))
446
+ # return {
447
+ # "response": output_response,
448
+ # "top_k_tokens": top_k_tokens_list,
449
+ # "token_logits": token_logits,
450
+ # }
451
+
452
+
453
+ # if __name__ == "__main__":
454
+ # model_path = "Qwen/Qwen3-VL-4B-Instruct" # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
455
+ # model = Qwen3VLModel(model_path)
456
+ # prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying to bend stick so nothing happens\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Stick is held by hands at two distinct points\n- Stick is intact\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Stick retains its original geometric shape\n- Stick remains intact\n\nAnswer:'
457
+ # token_choices = ["Yes", "No"]
458
+ # video_path = (
459
+ # "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/188064.mp4"
460
+ # )
461
+
462
+ # generation_config = {
463
+ # "max_new_tokens": 128,
464
+ # "do_sample": True, # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
465
+ # "temperature": 0.7,
466
+ # "logits_temperature": 1.0,
467
+ # "fps": 1.0,
468
+ # "return_confidence": False,
469
+ # "top_k_tokens": 10,
470
+ # "debug": False,
471
+ # }
472
+ # output = model.chat_with_confidence(
473
+ # prompt, video_path, token_choices=token_choices, **generation_config
474
+ # )
475
+ # response = output["response"]
476
+ # print(f"Response: {response}")
477
+
478
+ # if generation_config["return_confidence"]:
479
+ # confidence = output["confidence"]
480
+ # print(f"Confidence: {confidence}")
481
+ # else:
482
+ # # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
483
+ # # otherwise, the raw logits are used, which are not filtered.
484
+ # logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
485
+ # top_k_tokens = output["top_k_tokens"]
486
+ # for i in range(len(top_k_tokens)):
487
+ # print(f"Top {i+1} token: {top_k_tokens[i][0]}")
488
+ # print(f"Top {i+1} token logit: {top_k_tokens[i][2]}")
489
+ # print("--------------------------------")