Spaces:

jena-shreyas
/

Video-Inference-Demo

Sleeping

App Files Files Community

jena-shreyas committed 12 days ago

Commit

cf5f08b

1 Parent(s): 40a4325

Add correct models/ repo

Browse files

Files changed (5) hide show

models/.gitkeep +0 -0
models/__init__.py +64 -23
models/llava_video.py +173 -6
models/{qwen2_5.py → qwen2_5vl.py} +4 -0
models/qwen3vl.py +302 -66

models/.gitkeep ADDED Viewed

File without changes

models/__init__.py CHANGED Viewed

@@ -3,51 +3,64 @@ from packaging import version
 import torch
 from typing import Optional, Union, Dict
-# Required versions
 qwen_required_version = version.parse("4.57.0")
 llava_required_version = version.parse("4.40.0")
 # Conditional imports based on transformers version
 try:
     import transformers
-    # More robust import path for newer transformers
     transformers_version = version.parse(transformers.__version__)
     QWEN_MODELS_AVAILABLE = False
     LLAVA_MODELS_AVAILABLE = False
     # Qwen condition
     if transformers_version >= qwen_required_version:
-        from .qwen2_5 import Qwen2_5VLModel
         from .qwen3vl import Qwen3VLModel
         QWEN_MODELS_AVAILABLE = True
     else:
         print(
-            f"Warning: Qwen models require transformers>=4.57.0, "
-            f"but found {transformers.__version__}. "
-            f"Qwen models will not be available."
         )
     # LLaVA condition
     if transformers_version <= llava_required_version:
         from .llava_video import LLaVAVideoModel
         LLAVA_MODELS_AVAILABLE = True
     else:
         print(
-            f"Warning: LLaVA models require transformers<=4.40.0, "
-            f"but found {transformers.__version__}. "
-            f"LLaVA models will not be available."
         )
-except ImportError as e:
-    print("Warning: Could not import transformers correctly.")
-    raise e
 # Build __all__ list dynamically
 __all__ = []
 if QWEN_MODELS_AVAILABLE:
     __all__.extend(["Qwen2_5VLModel", "Qwen3VLModel"])
 if LLAVA_MODELS_AVAILABLE:
     __all__.append("LLaVAVideoModel")
@@ -59,12 +72,11 @@ def load_model(
     device_map: Optional[Union[str, Dict]] = "auto",
     attn_implementation: Optional[str] = "flash_attention_2",
 ) -> BaseVideoModel:
     if "LLaVA-Video" in model_path:
         if not LLAVA_MODELS_AVAILABLE:
             raise ImportError(
-                "LLaVA models require transformers<=4.40.0. "
-                "Please downgrade transformers."
             )
         return LLaVAVideoModel(
             model_path,
@@ -72,14 +84,12 @@ def load_model(
             device_map=device_map,
             attn_implementation=attn_implementation,
         )
     elif "Qwen" in model_path:
         if not QWEN_MODELS_AVAILABLE:
             raise ImportError(
-                "Qwen models require transformers>=4.57.0. "
-                "Please upgrade transformers."
             )
         if "Qwen3" in model_path:
             return Qwen3VLModel(
                 model_path,
@@ -94,7 +104,38 @@ def load_model(
                 device_map=device_map,
                 attn_implementation=attn_implementation,
             )
-    else:
-        raise ValueError(f"Unsupported model path: {model_path}")

 import torch
 from typing import Optional, Union, Dict
+# IMP: Add required versions here
 qwen_required_version = version.parse("4.57.0")
+internvl_required_version = version.parse("4.45.0")
 llava_required_version = version.parse("4.40.0")
+# The availability flags are initialised before the try block so that they
+# exist (as False) even when importing transformers itself fails; otherwise
+# the __all__ construction and load_model() below would raise NameError.
+QWEN_MODELS_AVAILABLE = False
+INTERNVL_MODELS_AVAILABLE = False
+LLAVA_MODELS_AVAILABLE = False
+# Placeholder base class, rebound to the real transformers class inside the
+# try block. It keeps the module-level LogitsCaptureProcessor definition
+# importable when transformers is missing.
+LogitsProcessor = object
 # Conditional imports based on transformers version
 try:
     import transformers
+    from transformers.generation.logits_process import LogitsProcessor
+    # Check transformers version
     transformers_version = version.parse(transformers.__version__)
     # Qwen condition
     if transformers_version >= qwen_required_version:
+        from .qwen2_5vl import Qwen2_5VLModel
         from .qwen3vl import Qwen3VLModel
         QWEN_MODELS_AVAILABLE = True
     else:
         print(
+            f"Warning: Qwen models require transformers>=4.57.0, but found {transformers.__version__}. Qwen models will not be available. Please upgrade to transformers>=4.57.0 or switch conda environments to use Qwen models."
+        )
+    # InternVL condition
+    if transformers_version >= internvl_required_version:
+        from .internvl import InternVLModel
+        INTERNVL_MODELS_AVAILABLE = True
+    else:
+        # The requirement is a lower bound, so the remedy is an upgrade
+        # (the previous message told the user to downgrade).
+        print(
+            f"Warning: InternVL models require transformers>=4.45.0, but found {transformers.__version__}. InternVL models will not be available. Please upgrade to transformers>=4.45.0 or switch conda environments to use InternVL models."
         )
     # LLaVA condition
     if transformers_version <= llava_required_version:
         from .llava_video import LLaVAVideoModel
         LLAVA_MODELS_AVAILABLE = True
     else:
         print(
+            f"Warning: LLaVA models require transformers<=4.40.0, but found {transformers.__version__}. LLaVA models will not be available. Please downgrade to transformers<=4.40.0 or switch conda environments to use LLaVA models."
         )
+except ImportError as e:
+    # Best-effort: keep the package importable and report what failed. The
+    # ImportError may come from transformers or from one of the model modules
+    # imported above, so the underlying error is included in the warning.
+    print(
+        f"Warning: Could not check transformers version. Please re-check transformers installation. Error: {e}"
+    )
 # Build __all__ list dynamically
 __all__ = []
 if QWEN_MODELS_AVAILABLE:
     __all__.extend(["Qwen2_5VLModel", "Qwen3VLModel"])
+if INTERNVL_MODELS_AVAILABLE:
+    __all__.append("InternVLModel")
 if LLAVA_MODELS_AVAILABLE:
     __all__.append("LLaVAVideoModel")
     device_map: Optional[Union[str, Dict]] = "auto",
     attn_implementation: Optional[str] = "flash_attention_2",
 ) -> BaseVideoModel:
     if "LLaVA-Video" in model_path:
         if not LLAVA_MODELS_AVAILABLE:
             raise ImportError(
+                f"LLaVA models require transformers<=4.40.0."
+                f"Please downgrade transformers: pip install transformers<=4.40.0"
             )
         return LLaVAVideoModel(
             model_path,
             device_map=device_map,
             attn_implementation=attn_implementation,
         )
     elif "Qwen" in model_path:
         if not QWEN_MODELS_AVAILABLE:
             raise ImportError(
+                f"Qwen models require transformers>=4.57.0."
+                f"Please upgrade transformers: pip install transformers>=4.57.0"
             )
         if "Qwen3" in model_path:
             return Qwen3VLModel(
                 model_path,
                 device_map=device_map,
                 attn_implementation=attn_implementation,
             )
+    elif "Intern" in model_path:
+        if not INTERNVL_MODELS_AVAILABLE:
+            raise ImportError(
+                f"InternVL models require transformers>=4.45.0."
+                f"Please upgrade transformers: pip install transformers>=4.45.0"
+            )
+        return InternVLModel(
+            model_path,
+            dtype=dtype,
+            device_map=device_map,
+            attn_implementation=attn_implementation,
+        )
+class LogitsCaptureProcessor(LogitsProcessor):
+    """
+    Pass-through LogitsProcessor that records every ``scores`` tensor it receives.
+    Each call stores a detached CPU copy of ``scores`` in ``captured_logits`` and
+    hands the original tensor back unmodified, so generation is not affected.
+    NOTE(review): the captured values reflect earlier processors only if this
+    processor is placed after them in the processor list - confirm at the call site.
+    """
+    def __init__(self):
+        # Grows by one entry per __call__ invocation until reset() is called.
+        self.captured_logits = []
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        # Snapshot is detached, cloned and moved to CPU so it is independent
+        # of the tensor that continues through generation.
+        snapshot = scores.detach().clone().cpu()
+        self.captured_logits.append(snapshot)
+        # Observation only: the incoming scores are returned as-is.
+        return scores
+    def reset(self):
+        """Discard previously captured logits before starting a new generation."""
+        self.captured_logits = list()

models/llava_video.py CHANGED Viewed

@@ -17,8 +17,7 @@ from PIL import Image
 import requests
 import copy
 import torch
-import sys
-from typing import Optional, Union, Dict, List, Any
 import warnings
 from decord import VideoReader, cpu
 import numpy as np
@@ -56,7 +55,6 @@ class LLaVAVideoModel(BaseVideoModel):
                 base_model,
                 torch_dtype=torch_dtype,
                 device_map=device_map,
-                attn_implementation=attn_implementation,
             )
         )  # Add any other thing you want to pass in llava_model_args
         self.model.eval()
@@ -105,10 +103,18 @@ class LLaVAVideoModel(BaseVideoModel):
         video_path: str,
         fps: float = 1.0,
         max_new_tokens: int = 512,
         temperature: float = 0.7,
         **kwargs: Any,
     ) -> str:
-        video, _, _ = self.load_video(video_path, fps)
         video = self.image_processor.preprocess(video, return_tensors="pt")[
             "pixel_values"
         ].to(device=self.model.device, dtype=self.dtype)
@@ -132,7 +138,7 @@ class LLaVAVideoModel(BaseVideoModel):
             input_ids,
             images=video,
             modalities=["video"],
-            do_sample=False,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
             **kwargs,
@@ -149,9 +155,170 @@ class LLaVAVideoModel(BaseVideoModel):
         fps: float = 1.0,
         max_new_tokens: int = 512,
         temperature: float = 0.7,
         token_choices: Optional[List[str]] = ["Yes", "No"],
         logits_temperature: Optional[float] = 1.0,
         return_confidence: Optional[bool] = False,
         debug: Optional[bool] = False,
     ) -> Dict[str, Any]:
-        pass

 import requests
 import copy
 import torch
+from typing import Optional, Union, Dict, List, Tuple, Any
 import warnings
 from decord import VideoReader, cpu
 import numpy as np
                 base_model,
                 torch_dtype=torch_dtype,
                 device_map=device_map,
             )
         )  # Add any other thing you want to pass in llava_model_args
         self.model.eval()
         video_path: str,
         fps: float = 1.0,
         max_new_tokens: int = 512,
+        do_sample: Optional[
+            bool
+        ] = True,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
         temperature: float = 0.7,
+        video_mode: Optional[str] = "video",
+        video_frames: Optional[int] = 10,
         **kwargs: Any,
     ) -> str:
+        if video_mode == "frames":
+            video, _, _ = self.load_video(video_path, max_frames_num=video_frames)
+        elif video_mode == "video":
+            video, _, _ = self.load_video(video_path, fps)
         video = self.image_processor.preprocess(video, return_tensors="pt")[
             "pixel_values"
         ].to(device=self.model.device, dtype=self.dtype)
             input_ids,
             images=video,
             modalities=["video"],
+            do_sample=do_sample,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
             **kwargs,
         fps: float = 1.0,
         max_new_tokens: int = 512,
         temperature: float = 0.7,
+        do_sample: Optional[
+            bool
+        ] = True,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
         token_choices: Optional[List[str]] = ["Yes", "No"],
         logits_temperature: Optional[float] = 1.0,
         return_confidence: Optional[bool] = False,
+        top_k_tokens: Optional[int] = 10,
         debug: Optional[bool] = False,
     ) -> Dict[str, Any]:
+        video, _, _ = self.load_video(video_path, fps)
+        video = self.image_processor.preprocess(video, return_tensors="pt")[
+            "pixel_values"
+        ].to(device=self.model.device, dtype=self.dtype)
+        video = [video]
+        conv_template = (
+            "qwen_1_5"  # Make sure you use correct chat template for different models
+        )
+        question = DEFAULT_IMAGE_TOKEN + f"\n{prompt}"
+        conv = copy.deepcopy(conv_templates[conv_template])
+        conv.append_message(conv.roles[0], question)
+        conv.append_message(conv.roles[1], None)
+        prompt_question = conv.get_prompt()
+        input_ids = (
+            tokenizer_image_token(
+                prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+            )
+            .unsqueeze(0)
+            .to(self.model.device)
+        )
+        with torch.no_grad():
+            outputs = self.model.generate(
+                input_ids,
+                images=video,
+                modalities=["video"],
+                do_sample=do_sample,  # Was set to False, i.e., greedy sampling, which invalidates things like temperature, top-K, top-P!
+                temperature=temperature,
+                max_new_tokens=max_new_tokens,
+                output_scores=True,
+                return_dict_in_generate=True,
+            )
+        generated_ids = outputs.sequences
+        scores = outputs.scores  # Tuple of tensors, one per generated token
+        print(f"Number of generated tokens: {len(scores)}")
+        print(f"Vocabulary size: {scores[0].shape[1]}")
+        # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
+        if debug:
+            print("****Running inference in debug mode****")
+            # Print first token scores shape and max/min scores in debug mode
+            print(f"Single token scores shape: {scores[0].shape}")
+            print(
+                f"Max score: {scores[0].max().item():.4f} | Min score: {scores[0].min().item():.4f}"
+            )
+            # Print details about top 10 tokens based on logits
+            logits_type = "POST-PROCESSED" if do_sample is True else "RAW"
+            print(f"\n{'─'*80}")
+            print(
+                f"TOP {top_k_tokens} TOKENS FROM {logits_type} LOGITS (outputs.scores):"
+            )
+            print(f"{'─'*80}")
+            top_k_tokens_scores = torch.topk(scores[0], k=top_k_tokens, dim=-1)
+            for i in range(top_k_tokens):
+                score = top_k_tokens_scores.values[0, i].item()
+                score_index = top_k_tokens_scores.indices[0, i].item()
+                token = self.tokenizer.decode(score_index)
+                print(f"#{i+1}th Token: {token}")
+                print(f"#{i+1}th Token index: {score_index}")
+                print(f"#{i+1}th Token score: {score}")
+                print("--------------------------------")
+        # Decode the text
+        output_response = self.tokenizer.batch_decode(
+            generated_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+        # Convert scores to probabilities
+        # scores is a tuple of (batch_size, vocab_size) tensors, one per generated token
+        selected_token_probs = []
+        selected_token_logits = []
+        first_token_probs = torch.softmax(scores[0], dim=-1)
+        # Now, find indices of tokens in token_choices and get their probabilities
+        for token_choice in token_choices:
+            # Tokenize the choice - encode returns a list, we want the first actual token (skip special tokens)
+            token_index = self.tokenizer.encode(token_choice, add_special_tokens=False)[
+                0
+            ]
+            selected_token_probs.append(first_token_probs[0, token_index].item())
+            selected_token_logits.append(scores[0][0, token_index].item())
+        # Compute confidence as the ratio of first token's probability to the sum of all probabilities in selected_token_probs
+        if return_confidence:
+            first_token_id = generated_ids[0][
+                0
+            ].item()  # First token of the first sequence
+            confidence = (
+                first_token_probs[0, first_token_id].item() / sum(selected_token_probs)
+                if sum(selected_token_probs) > 0
+                else 0.0
+            )
+            return {
+                "response": output_response,
+                "confidence": confidence,
+            }
+        # Return token logits
+        else:
+            token_logits = dict(zip(token_choices, selected_token_logits))
+            top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
+            top_k_tokens_list: List[Tuple[str, int, float]] = []
+            for i in range(top_k_tokens):
+                logit_index = top_k_logits_indices.indices[0, i].item()
+                token = self.tokenizer.decode(logit_index)
+                logit = top_k_logits_indices.values[0, i].item()
+                top_k_tokens_list.append((token, logit_index, logit))
+            return {
+                "response": output_response,
+                "top_k_tokens": top_k_tokens_list,
+                "token_logits": token_logits,
+            }
+# Manual smoke test: loads LLaVA-Video, asks one Yes/No question about a local
+# video and prints either the confidence or the top-k first-token logits.
+if __name__ == "__main__":
+    model_path = "lmms-lab/LLaVA-Video-7B-Qwen2"  # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
+    device_map = "cuda:0"
+    model = LLaVAVideoModel(model_path, device_map=device_map)
+    prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying but failing to attach clip to ring because it doesn\'t stick\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Hand is holding the clip\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Clip is physically separate from the ring\n- Clip is not attached to the ring\n\nAnswer:'
+    token_choices = ["Yes", "No"]
+    video_path = (
+        "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/101917.mp4"
+    )
+    generation_config = {
+        "max_new_tokens": 128,
+        "do_sample": False,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
+        "temperature": 0.7,
+        "logits_temperature": 1.0,
+        "fps": 1.0,
+        "return_confidence": False,
+        "top_k_tokens": 10,
+        "debug": False,
+    }
+    output = model.chat_with_confidence(
+        prompt, video_path, token_choices=token_choices, **generation_config
+    )
+    response = output["response"]
+    print(f"Response: {response}")
+    if generation_config["return_confidence"]:
+        confidence = output["confidence"]
+        print(f"Confidence: {confidence}")
+    else:
+        # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
+        # otherwise, the raw logits are used, which are not filtered.
+        logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
+        top_k_tokens = output["top_k_tokens"]
+        print(f"\n{'─'*80}")
+        # Report the number of entries actually returned instead of a
+        # hard-coded 10, so the header stays correct if "top_k_tokens" changes.
+        print(
+            f"TOP {len(top_k_tokens)} TOKENS FROM {logits_type} LOGITS (outputs.scores):"
+        )
+        print(f"{'─'*80}")
+        # Each entry is a (token, token_index, logit) tuple.
+        for rank, (token, token_index, logit) in enumerate(top_k_tokens, start=1):
+            print(f"Top {rank} token: {token}")
+            print(f"Top {rank} token index: {token_index}")
+            print(f"Top {rank} token logit: {logit}")
+            print("--------------------------------")

models/{qwen2_5.py → qwen2_5vl.py} RENAMED Viewed

@@ -39,6 +39,8 @@ class Qwen2_5VLModel(BaseVideoModel):
         fps: float = 1.0,
         temperature: float = 0.7,
         max_new_tokens: int = 512,
     ) -> str:
         # Messages containing a local video path and a text query
         messages = [
@@ -75,8 +77,10 @@ class Qwen2_5VLModel(BaseVideoModel):
         # Inference
         generated_ids = self.model.generate(
             **inputs,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
         )
         generated_ids_trimmed = [
             out_ids[len(in_ids) :]

         fps: float = 1.0,
         temperature: float = 0.7,
         max_new_tokens: int = 512,
+        do_sample: Optional[bool] = True,
+        **kwargs: Any,
     ) -> str:
         # Messages containing a local video path and a text query
         messages = [
         # Inference
         generated_ids = self.model.generate(
             **inputs,
+            do_sample=do_sample,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
+            **kwargs,
         )
         generated_ids_trimmed = [
             out_ids[len(in_ids) :]

models/qwen3vl.py CHANGED Viewed

@@ -5,8 +5,11 @@ from transformers import (
     Qwen3VLForConditionalGeneration,
     AutoProcessor,
 )
-from typing import Optional, Dict, Any, Union, List
 from qwen_vl_utils import process_vision_info
 # Handle both relative and absolute imports
 try:
@@ -15,6 +18,36 @@ except ImportError:
     from base import BaseVideoModel
 class Qwen3VLModel(BaseVideoModel):
     def __init__(
         self,
@@ -38,31 +71,55 @@ class Qwen3VLModel(BaseVideoModel):
         video_path: str,
         fps: float = 1.0,
         temperature: float = 0.7,
         max_new_tokens: int = 512,
     ) -> str:
         # Messages containing a local video path and a text query
         messages = [
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "video",
-                        "video": video_path,
-                        # "max_pixels": 360 * 420,
-                        "fps": fps,
-                    },
                     {"type": "text", "text": prompt},
                 ],
             }
         ]
-        inputs = self.processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
         inputs = inputs.to(self.model.device)
@@ -70,6 +127,8 @@ class Qwen3VLModel(BaseVideoModel):
             **inputs,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
         )
         generated_ids_trimmed = [
@@ -92,13 +151,18 @@ class Qwen3VLModel(BaseVideoModel):
         fps: float = 1.0,
         max_new_tokens: int = 512,
         temperature: float = 0.7,
         token_choices: Optional[List[str]] = ["Yes", "No"],
         logits_temperature: Optional[float] = 1.0,
         return_confidence: Optional[bool] = False,
         debug: Optional[bool] = False,
     ) -> Dict[str, Any]:
         """
-        Returns the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
         Args:
             prompt (str): The text prompt to generate a response for.
@@ -108,19 +172,17 @@ class Qwen3VLModel(BaseVideoModel):
             token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
             generation_config (Dict[str, Any], optional): The generation configuration. Defaults to None.
             return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
             debug (bool, optional): Whether to run in debug mode. Defaults to False.
         Returns:
-            Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the token logits for token_choices.
         e.g., return_confidence: False
             Output:
                 {
                     "response": "Yes",
-                    "logits": {
-                        "Yes": 12.0,
-                        "No": 9.0
-                    }
                 }
         e.g., return_confidence: True
@@ -146,68 +208,233 @@ class Qwen3VLModel(BaseVideoModel):
             }
         ]
-        text = self.processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        image_inputs, videos, video_kwargs = process_vision_info(
             messages,
-            image_patch_size=16,
-            return_video_kwargs=True,
-            return_video_metadata=True,
-        )
-        # Extract out videos and video metadata
-        if videos is not None:
-            videos, video_metadatas = zip(*videos)
-            videos, video_metadatas = list(videos), list(video_metadatas)
-        else:
-            video_metadatas = None
-        inputs = self.processor(
-            text=text,
-            images=image_inputs,
-            videos=videos,
-            video_metadata=video_metadatas,
             return_tensors="pt",
-            do_resize=False,
-            **video_kwargs,
         )
         inputs = inputs.to(self.model.device)
         # Inference with scores
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 temperature=temperature,
                 max_new_tokens=max_new_tokens,
                 output_scores=True,
                 return_dict_in_generate=True,
             )
         generated_ids = outputs.sequences
-        scores = outputs.scores  # Tuple of tensors, one per generated token
         scores = tuple(
             s / logits_temperature for s in scores
         )  # Scales the logits by a factor for normalization during reporting
         print(f"Number of generated tokens: {len(scores)}")
         print(f"Vocabulary size: {scores[0].shape[1]}")
         # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
         if debug:
             print("****Running inference in debug mode****")
             # Print first token scores shape and max/min scores in debug mode
-            print(f"Single token scores shape: {scores[0].shape}")
             print(
-                f"First token max/min scores: {scores[0].max().item()}, {scores[0].min().item()}"
             )
-            # Print details about top 3 tokens
-            top_3_tokens = torch.topk(scores[0], k=3, dim=-1)
             for i in range(3):
                 print(
-                    f"Pos 0 | {i+1}th Token: {self.processor.decode(top_3_tokens.indices[0, i].item())}"
                 )
                 print(
-                    f"Pos 0 | {i+1}th Token logit: {top_3_tokens.values[0, i].item()}"
                 )
         # Trim the prompt tokens from generated sequences
         generated_ids_trimmed = [
@@ -252,37 +479,41 @@ class Qwen3VLModel(BaseVideoModel):
                 "confidence": confidence,
             }
-        # Retrn token logits
         else:
             token_logits = dict(zip(token_choices, selected_token_logits))
             return {
                 "response": output_response,
-                "logits": token_logits,
             }
 if __name__ == "__main__":
     model_path = "Qwen/Qwen3-VL-4B-Instruct"  # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
     model = Qwen3VLModel(model_path)
-    prompt = "Describe this video."
-    ext = ".mp4"
     video_path = (
-        "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/1586" + ext
     )
-    response = model.chat(prompt, video_path)
-    print("Response: ", response)
-    token_choices = ["A", "B"]
-    ext = ".webm"
-    video_path = "/home/shreyasj/Syed/data/Something-Something-V2/videos/101917" + ext
     generation_config = {
         "max_new_tokens": 128,
         "temperature": 0.7,
-        "logits_temperature": 5.0,
-        "fps": 3.0,
         "return_confidence": False,
-        "debug": True,
     }
     output = model.chat_with_confidence(
         prompt, video_path, token_choices=token_choices, **generation_config
@@ -294,6 +525,11 @@ if __name__ == "__main__":
         confidence = output["confidence"]
         print(f"Confidence: {confidence}")
     else:
-        selected_token_logits = output["logits"]
-        print(f"Selected token logits: {selected_token_logits}")
-        print(f"Logits temperature: {generation_config['logits_temperature']}")

     Qwen3VLForConditionalGeneration,
     AutoProcessor,
 )
+from typing import Optional, Dict, Any, Union, List, Tuple
 from qwen_vl_utils import process_vision_info
+import cv2
+import numpy as np
+from PIL import Image
 # Handle both relative and absolute imports
 try:
     from base import BaseVideoModel
+def downsample_video(video_path, max_dim=720, num_frames=10):
+    """Sample evenly spaced frames from a video and return them as RGB PIL images.
+
+    Args:
+        video_path: Path handed to ``cv2.VideoCapture``.
+        max_dim: Upper bound on the longer side of each returned frame. Larger
+            frames are shrunk with their aspect ratio preserved; frames that
+            already fit are returned at their original size.
+        num_frames: Maximum number of frames to sample.
+
+    Returns:
+        A list of ``PIL.Image.Image`` frames in temporal order. Frames that
+        fail to decode are skipped, so the list may hold fewer than
+        ``num_frames`` items; it is empty when no frames can be sampled.
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    frames = []
+    try:
+        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        num_samples = min(total_frames, num_frames)
+        # np.linspace raises on a negative sample count, so treat any
+        # non-positive count (unreadable video, num_frames <= 0) as "no frames".
+        if num_samples <= 0:
+            return frames
+        frame_indices = np.linspace(0, total_frames - 1, num_samples, dtype=int)
+        for frame_index in frame_indices:
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_index))
+            success, image = vidcap.read()
+            if not success:
+                continue
+            # OpenCV decodes to BGR; PIL expects RGB channel order.
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            h, w = image.shape[:2]
+            scale = max_dim / max(h, w)
+            if scale < 1:
+                # Clamp to 1px so an extreme aspect ratio never requests a
+                # zero-sized resize target.
+                new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
+                image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+            frames.append(Image.fromarray(image))
+    finally:
+        # Release the capture handle even if decoding or resizing raises.
+        vidcap.release()
+    return frames
 class Qwen3VLModel(BaseVideoModel):
     def __init__(
         self,
         video_path: str,
         fps: float = 1.0,
         temperature: float = 0.7,
+        do_sample: Optional[
+            bool
+        ] = True,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
         max_new_tokens: int = 512,
+        video_mode: Optional[str] = "video",  # Choose from "video" or "frames"
+        video_frames: Optional[int] = 10,
+        **kwargs: Any,
     ) -> str:
         # Messages containing a local video path and a text query
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": prompt},
                 ],
             }
         ]
+        if video_mode == "video":
+            messages[0]["content"].append(
+                {
+                    "type": "video",
+                    "video": video_path,
+                    # "max_pixels": 360 * 420,
+                    "fps": fps,
+                }
+            )
+            inputs = self.processor.apply_chat_template(
+                messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+        elif video_mode == "frames":
+            frames = downsample_video(video_path, max_dim=720, num_frames=video_frames)
+            images_for_processor = []
+            for frame in frames:
+                messages[0]["content"].append({"type": "image"})
+                images_for_processor.append(frame)
+            prompt_full = self.processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            inputs = self.processor(
+                text=[prompt_full],
+                images=images_for_processor,
+                return_tensors="pt",
+                padding=True,
+            )
         inputs = inputs.to(self.model.device)
             **inputs,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
+            do_sample=do_sample,
+            **kwargs,
         )
         generated_ids_trimmed = [
         fps: float = 1.0,
         max_new_tokens: int = 512,
         temperature: float = 0.7,
+        do_sample: Optional[
+            bool
+        ] = True,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P!
         token_choices: Optional[List[str]] = ["Yes", "No"],
         logits_temperature: Optional[float] = 1.0,
         return_confidence: Optional[bool] = False,
+        top_k_tokens: Optional[int] = 10,
         debug: Optional[bool] = False,
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
+        Returns the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
         Args:
             prompt (str): The text prompt to generate a response for.
             token_choices (List[str], optional): The list of token choices to return logits for. Defaults to ["Yes", "No"].
             generation_config (Dict[str, Any], optional): The generation configuration. Defaults to None.
             return_confidence (bool, optional): Whether to return the confidence of the response. Defaults to False.
+            top_k_tokens (int, optional): The number of top tokens to return. Defaults to 10. Only applicable if return_confidence is False.
             debug (bool, optional): Whether to run in debug mode. Defaults to False.
         Returns:
+            Dict[str, Any]: A dictionary containing the response and confidence of the response, if return_confidence is True. Else, returns the top k tokens and their logits.
         e.g., return_confidence: False
             Output:
                 {
                     "response": "Yes",
+                    "top_k_tokens": [("Yes", 12, 12.0), ("No", 9, 9.0)],  # (token, token id, logit)
                 }
         e.g., return_confidence: True
             }
         ]
+        inputs = self.processor.apply_chat_template(
             messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
             return_tensors="pt",
         )
         inputs = inputs.to(self.model.device)
+        # In debug mode, inspect what logits processors will be used
+        if debug:
+            print("\n" + "=" * 80)
+            print("INSPECTING GENERATION CONFIG & WARPERS")
+            print("=" * 80)
+            # Get the generation config to see what processors will be added
+            gen_config = self.model.generation_config
+            print(f"Generation config attributes:")
+            print(f"  Processor-related:")
+            print(
+                f"    - repetition_penalty: {getattr(gen_config, 'repetition_penalty', None)}"
+            )
+            print(
+                f"    - no_repeat_ngram_size: {getattr(gen_config, 'no_repeat_ngram_size', None)}"
+            )
+            print(
+                f"    - encoder_no_repeat_ngram_size: {getattr(gen_config, 'encoder_no_repeat_ngram_size', None)}"
+            )
+            print(f"    - bad_words_ids: {getattr(gen_config, 'bad_words_ids', None)}")
+            print(f"    - min_length: {getattr(gen_config, 'min_length', None)}")
+            print(
+                f"    - forced_bos_token_id: {getattr(gen_config, 'forced_bos_token_id', None)}"
+            )
+            print(
+                f"    - forced_eos_token_id: {getattr(gen_config, 'forced_eos_token_id', None)}"
+            )
+            print(f"  Warper-related (THESE MASK TOKENS TO -INF):")
+            print(f"    - temperature: {temperature} (passed as arg)")
+            print(
+                f"    - do_sample: {getattr(gen_config, 'do_sample', 'Not set (will be inferred)')}"
+            )
+            print(f"    - top_k: {getattr(gen_config, 'top_k', None)}")
+            print(f"    - top_p: {getattr(gen_config, 'top_p', None)}")
+            print(f"    - typical_p: {getattr(gen_config, 'typical_p', None)}")
+            print(
+                f"    - epsilon_cutoff: {getattr(gen_config, 'epsilon_cutoff', None)}"
+            )
+            print(f"    - eta_cutoff: {getattr(gen_config, 'eta_cutoff', None)}")
+            print(
+                f"\n  ⚠️  If top_k or top_p are set, they will mask non-selected tokens to -inf!"
+            )
+            print("=" * 80 + "\n")
         # Inference with scores
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 temperature=temperature,
                 max_new_tokens=max_new_tokens,
+                do_sample=do_sample,
                 output_scores=True,
+                output_logits=True,  # Get TRUE raw logits before any processing
                 return_dict_in_generate=True,
+                **kwargs,
             )
         generated_ids = outputs.sequences
+        scores = outputs.scores  # Tuple of tensors - PROCESSED logits used for sampling
+        logits = (
+            outputs.logits if hasattr(outputs, "logits") else None
+        )  # TRUE raw logits from model
         scores = tuple(
             s / logits_temperature for s in scores
         )  # Scales the logits by a factor for normalization during reporting
         print(f"Number of generated tokens: {len(scores)}")
         print(f"Vocabulary size: {scores[0].shape[1]}")
+        # Check if logits differ from scores
+        if debug and logits is not None:
+            print(f"\n[IMPORTANT] output_logits available: True")
+            print(
+                f"[IMPORTANT] Comparing outputs.logits (raw) vs outputs.scores (processed):"
+            )
+            logits_raw = logits[0] / logits_temperature  # First token's raw logits
+            scores_first = scores[0]  # First token's processed scores
+            logits_diff = (logits_raw.cpu() - scores_first.cpu()).abs()
+            max_diff = logits_diff.max().item()
+            if max_diff > 0.001:
+                print(
+                    f"[IMPORTANT] ⚠️  outputs.scores ARE DIFFERENT from outputs.logits!"
+                )
+                print(f"[IMPORTANT]    Max difference: {max_diff:.6f}")
+                print(
+                    f"[IMPORTANT]    This means outputs.scores are PROCESSED, not raw!"
+                )
+            else:
+                print(f"[IMPORTANT] ✓ outputs.scores == outputs.logits (both are raw)")
+        elif debug:
+            print(
+                f"\n[IMPORTANT] output_logits not available in this transformers version"
+            )
         # Print top 3 tokens at 1st position (i.e., scores[0]) along with their probabilities in debug mode
         if debug:
+            print("\n" + "=" * 80)
             print("****Running inference in debug mode****")
+            print("=" * 80)
+            # Use truly raw logits if available, otherwise use scores
+            raw_logits_to_show = (
+                logits[0] / logits_temperature if logits is not None else scores[0]
+            )
+            logits_label = (
+                "TRUE RAW LOGITS (from outputs.logits)"
+                if logits is not None
+                else "LOGITS (from outputs.scores)"
+            )
             # Print first token scores shape and max/min scores in debug mode
             print(
+                f"\n[{logits_label}] Single token scores shape: {raw_logits_to_show.shape}"
+            )
+            print(
+                f"[{logits_label}] First token max/min: {raw_logits_to_show.max().item():.4f}, {raw_logits_to_show.min().item():.4f}"
+            )
+            # Print details about top 3 tokens from RAW logits
+            print(f"\n{'─'*80}")
+            print(f"TOP 3 TOKENS FROM {logits_label}:")
+            print(f"{'─'*80}")
+            top_3_tokens = torch.topk(raw_logits_to_show, k=3, dim=-1)
+            for i in range(3):
+                token_id = top_3_tokens.indices[0, i].item()
+                token_text = self.processor.decode(token_id)
+                token_logit = top_3_tokens.values[0, i].item()
+                print(
+                    f"  #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
+                )
+            # Now compare with POST-PROCESSED logits (outputs.scores)
+            scores_first = scores[0] / logits_temperature
+            print(f"\n{'─'*80}")
+            print("TOP 3 TOKENS FROM LOGITS CAPTURE (after all processors):")
+            print(f"{'─'*80}")
+            print(
+                f"[POST-PROCESSED] Max/min logits: {scores_first.max().item():.4f}, {scores_first.min().item():.4f}"
             )
+            top_3_processed = torch.topk(scores_first, k=3, dim=-1)
             for i in range(3):
+                token_id = top_3_processed.indices[0, i].item()
+                token_text = self.processor.decode(token_id)
+                token_logit = top_3_processed.values[0, i].item()
                 print(
+                    f"  #{i+1}: Token='{token_text}' | ID={token_id} | Logit={token_logit:.4f}"
+                )
+            # Check if the distributions differ (compare against truly raw logits if available)
+            print(f"\n{'─'*80}")
+            print("DIFFERENCE ANALYSIS (Raw → Post-Processed):")
+            print(f"{'─'*80}")
+            logit_diff = (scores_first.cpu() - raw_logits_to_show.cpu()).abs()
+            max_diff = logit_diff.max().item()
+            num_changed = (logit_diff > 0.001).sum().item()
+            print(f"  Max logit difference: {max_diff:.6f}")
+            print(
+                f"  Number of tokens with changed logits: {num_changed}/{raw_logits_to_show.shape[1]}"
+            )
+            if max_diff > 0.001:
+                print(f"\n  ⚠️  LOGITS WERE MODIFIED BY PROCESSORS!")
+                # Show which tokens changed the most
+                top_changes = torch.topk(logit_diff[0], k=min(5, num_changed))
+                print(f"\n  Top 5 most changed tokens:")
+                for i in range(min(5, len(top_changes.indices))):
+                    token_id = top_changes.indices[i].item()
+                    token_text = self.processor.decode(token_id)
+                    raw_logit = raw_logits_to_show[0, token_id].item()
+                    processed_logit = scores_first[0, token_id].item()
+                    diff = top_changes.values[i].item()
+                    print(f"    Token='{token_text}' | ID={token_id}")
+                    print(
+                        f"      Raw: {raw_logit:.4f} → Processed: {processed_logit:.4f} (Δ={diff:.4f})"
+                    )
+            else:
+                print(f"  ✓ No significant modifications detected")
+            # Show what token was actually selected
+            print(f"\n{'─'*80}")
+            print("ACTUALLY GENERATED TOKEN:")
+            print(f"{'─'*80}")
+            first_generated_id = generated_ids[0, len(inputs.input_ids[0])].item()
+            first_generated_token = self.processor.decode(first_generated_id)
+            raw_logit_for_generated = raw_logits_to_show[0, first_generated_id].item()
+            print(f"  Token: '{first_generated_token}' | ID={first_generated_id}")
+            print(f"  Raw logit: {raw_logit_for_generated:.4f}")
+            processed_logit_for_generated = scores_first[0, first_generated_id].item()
+            print(f"  Post-processed logit: {processed_logit_for_generated:.4f}")
+            # Check if this token is in top-k of raw logits
+            top_k_raw_indices = torch.topk(
+                raw_logits_to_show, k=min(10, raw_logits_to_show.shape[1]), dim=-1
+            ).indices[0]
+            is_in_top10_raw = first_generated_id in top_k_raw_indices
+            print(f"  In top-10 of RAW logits: {is_in_top10_raw}")
+            if not is_in_top10_raw:
+                print(
+                    f"\n  🚨 CRITICAL: Generated token was NOT in top-10 of raw logits!"
                 )
                 print(
+                    f"      This proves that logits processors modified the distribution."
                 )
+                # Find the rank of the generated token in raw logits
+                sorted_raw = torch.argsort(raw_logits_to_show[0], descending=True)
+                raw_rank = (sorted_raw == first_generated_id).nonzero(as_tuple=True)[
+                    0
+                ].item() + 1
+                print(f"      Raw logits rank: {raw_rank}")
+            print("=" * 80 + "\n")
         # Trim the prompt tokens from generated sequences
         generated_ids_trimmed = [
                 "confidence": confidence,
             }
+        # Return token logits
         else:
             token_logits = dict(zip(token_choices, selected_token_logits))
+            top_k_logits_indices = torch.topk(scores[0], k=top_k_tokens, dim=-1)
+            top_k_tokens_list: List[Tuple[str, int, float]] = []
+            for i in range(top_k_tokens):
+                logit_index = top_k_logits_indices.indices[0, i].item()
+                token = self.processor.decode(logit_index)
+                logit = top_k_logits_indices.values[0, i].item()
+                top_k_tokens_list.append((token, logit_index, logit))
             return {
                 "response": output_response,
+                "top_k_tokens": top_k_tokens_list,
+                "token_logits": token_logits,
             }
 if __name__ == "__main__":
     model_path = "Qwen/Qwen3-VL-4B-Instruct"  # "Qwen/Qwen3-VL-8B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct"
     model = Qwen3VLModel(model_path)
+    prompt = 'Does the following action accurately describe the one shown in the video? \nAnswer with "Yes" or "No".\n\nAction: Trying to bend stick so nothing happens\n\nConditions which may/may not be true BEFORE the aforementioned action occurs:\n- Stick is held by hands at two distinct points\n- Stick is intact\n\nConditions which may/may not be true AFTER the aforementioned action occurs:\n- Stick retains its original geometric shape\n- Stick remains intact\n\nAnswer:'
+    token_choices = ["Yes", "No"]
     video_path = (
+        "/home/shreyasj/Syed/data/Something-Something-V2/pre-post/videos/188064.mp4"
     )
     generation_config = {
         "max_new_tokens": 128,
+        "do_sample": True,  # False enables greedy sampling, which invalidates things like temperature, top-K, top-P. Allows return of raw logits
         "temperature": 0.7,
+        "logits_temperature": 1.0,
+        "fps": 1.0,
         "return_confidence": False,
+        "top_k_tokens": 10,
+        "debug": False,
     }
     output = model.chat_with_confidence(
         prompt, video_path, token_choices=token_choices, **generation_config
         confidence = output["confidence"]
         print(f"Confidence: {confidence}")
     else:
+        # If do_sample is True, logits pass through logit warpers which filter out un-important tokens (based on logits) to -inf,
+        # otherwise, the raw logits are used, which are not filtered.
+        logits_type = "POST-PROCESSED" if generation_config["do_sample"] else "RAW"
+        top_k_tokens = output["top_k_tokens"]
+        for i in range(len(top_k_tokens)):
+            print(f"Top {i+1} token: {top_k_tokens[i][0]}")
+            print(f"Top {i+1} token logit: {top_k_tokens[i][2]}")
+            print("--------------------------------")