Spaces:

jena-shreyas
/

Video-Inference-Demo

Sleeping

File size: 18,605 Bytes

80ceab0
 
 
2a63fd4
 
80ceab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5644567
80ceab0
5644567
8427fe9
5644567
 
8427fe9
 
 
 
5644567
8427fe9
5644567
2a63fd4
 
 
 
 
 
 
 
5644567
 
 
8427fe9
5644567
8427fe9
5644567
 
 
8427fe9
 
 
06ef027
8427fe9
 
06ef027
 
5644567
8427fe9
5644567
 
 
 
8427fe9
 
 
 
5644567
 
8427fe9
80ceab0
 
 
 
 
 
 
38b23dd
80ceab0
 
 
 
 
 
 
 
38b23dd
80ceab0
 
 
 
 
b70cd0c
 
 
e6b02aa
b70cd0c
e6b02aa
b70cd0c
 
 
 
 
80ceab0
 
 
 
 
 
 
 
b70cd0c
80ceab0
 
b70cd0c
 
 
 
 
 
 
 
 
 
 
 
 
e6b02aa
0cdc356
b70cd0c
 
 
 
 
 
 
 
 
80ceab0
 
 
 
 
 
 
 
8427fe9
 
80ceab0
 
 
 
b70cd0c
 
80ceab0
 
 
 
b70cd0c
 
80ceab0
 
 
 
b70cd0c
 
80ceab0
8427fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b70cd0c
 
 
5644567
 
 
 
 
 
 
 
 
 
8427fe9
5644567
 
 
 
b70cd0c
 
ef7643d
b70cd0c
 
 
 
 
 
 
 
 
 
 
 
 
e6b02aa
 
 
 
 
 
 
 
 
b70cd0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80ceab0
 
 
b70cd0c
 
80ceab0
 
b70cd0c
 
 
80ceab0
b70cd0c
80ceab0
 
b70cd0c
 
80ceab0
b70cd0c
 
 
 
 
 
5644567
b70cd0c
 
 
 
 
80ceab0
 
 
 
 
 
 
 
5644567
 
 
8427fe9
 
 
 
5644567
 
 
80ceab0
 
 
b70cd0c
 
 
e6b02aa
b70cd0c
e6b02aa
b70cd0c
 
 
 
 
80ceab0
 
 
 
 
 
 
079a5b2
80ceab0

import os
import sys
from pathlib import Path
import gc
import torch
import gradio as gr

# Allow importing your models package
sys.path.insert(0, str(Path(__file__).parent))

from models import load_model
from models.base import BaseVideoModel

# ----------------------
# CONFIG
# ----------------------
DEVICE_MAP = "cuda:0"

VIDEO_DIR = str(Path(__file__).parent / "videos")

FPS = 1.0
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.01

# ----------------------
# Model loading with quantization support
# ----------------------
model: BaseVideoModel = None
current_model_name = "Qwen3-VL-4B-Instruct"
current_quantization = "16-bit"

def load_model_with_quantization(
        model_name: str,
        quantization: str
    ):
    """Load or reload the model with specified quantization"""
    global model, current_model_name, current_quantization
    
    # Free GPU memory if model already exists
    if model is not None:
        print("Unloading existing model and freeing GPU memory...")
        del model
        gc.collect()
        torch.cuda.empty_cache()
        print("GPU memory cleared.")
    
    load_8bit = False
    load_4bit = False
    
    if quantization == "8-bit":
        load_8bit = True
    elif quantization == "4-bit":
        load_4bit = True
    # else: 16-bit (normal) - both flags remain False
    
    print(f"Loading {model_name} with {quantization} quantization...")
    model_path = model_name
    # Load the HF version of LLaVA-Video-7B instead of the default version, for transformers v5 compatibility
    # For the Qwen models, load the model from the Qwen directory
    if model_name == "LLaVA-Video-7B-Qwen2":
        model_path = "Isotr0py/LLaVA-Video-7B-Qwen2-hf"
    elif model_name.startswith("Qwen"):
        model_path = f"Qwen/{model_name}"
    model = load_model(
        model_path,
        device_map=DEVICE_MAP,
        load_8bit=load_8bit,
        load_4bit=load_4bit,
    )
    current_model_name = model_name
    current_quantization = quantization
    print(f"{model_name} loaded with {quantization} quantization.")
    return f"✅ {model_name} loaded successfully with {quantization} quantization"

# Load model initially with 16-bit (normal)
load_model_with_quantization(current_model_name, current_quantization)

# ----------------------
# Collect video IDs
# ----------------------
VIDEO_IDS = sorted([
    os.path.splitext(f)[0]
    for f in os.listdir(VIDEO_DIR)
    if f.endswith(".mp4")
])

# ----------------------
# Helpers
# ----------------------
def get_video_path(video_id: str):
    if not video_id:
        return None
    path = os.path.join(VIDEO_DIR, video_id + ".mp4")
    return path if os.path.exists(path) else None

# ----------------------
# Inference function
# ----------------------
def video_qa(
    video_id: str,
    prompt: str,
    video_mode: str,
    fps: float,
    num_frames: int,
    max_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
) -> str:
    if not video_id:
        return "❌ Please select a video ID."

    if not prompt.strip():
        return "❌ Please enter a prompt."

    video_path = get_video_path(video_id)
    if video_path is None:
        return f"❌ Video not found: {video_id}.mp4"

    try:
        # Prepare generation config
        generation_config = {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
        }
        
        # Add video_mode if supported by the model
        kwargs = {
            "prompt": prompt,
            "video_path": video_path,
            "fps": fps,
            "num_frames": num_frames,
            **generation_config
        }
        
        # Try to add video_mode (for Qwen models)
        try:
            response = model.chat(**kwargs, video_mode=video_mode)
        except TypeError:
            # If video_mode is not supported, fall back to without it
            response = model.chat(**kwargs)
        
        return response

    except Exception as e:
        return f"❌ Error during inference: {str(e)}"

# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="Video Inference Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎥 Video Inference")

    with gr.Row():
        # LEFT COLUMN
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Video Selection")
            
            video_id = gr.Dropdown(
                choices=VIDEO_IDS,
                label="Video ID",
                filterable=True,
                interactive=True,
                value=VIDEO_IDS[0] if VIDEO_IDS else None
            )

            video_player = gr.Video(
                label="Selected Video",
                autoplay=False,
                height=300
            )

            gr.Markdown("### 🤖 Model Name")

            model_name_radio = gr.Radio(
                choices=[
                    "Qwen3-VL-4B-Instruct", 
                    "Qwen3-VL-8B-Instruct",
                    "Qwen3-VL-2B-Thinking",
                    "Qwen3-VL-4B-Thinking",
                    "LLaVA-Video-7B-Qwen2"
                ],
                value="Qwen3-VL-4B-Instruct",
                label="🤖 Model Name",
                info="Select the model to use for inference"
            )
            
            gr.Markdown("### ⚙️ Model Parameters")
            
            quantization_radio = gr.Radio(
                choices=["16-bit", "8-bit", "4-bit"],
                value="16-bit",
                label="🔧 Model Quantization",
                info="16-bit: Default precision, 8-bit/4-bit: Reduced memory usage"
            )
            
            reload_button = gr.Button("🔄 Reload Model", variant="secondary")
            reload_status = gr.Textbox(
                label="Model Status",
                value=f"{current_model_name} loaded with {current_quantization} quantization",
                interactive=False,
                lines=1
            )
            
            fps_slider = gr.Slider(
                minimum=0.5,
                maximum=10.0,
                step=0.5,
                value=FPS,
                label="🎞️ Frames Per Second (FPS)",
                info="Sample rate for video frames"
            )
            
            video_mode_radio = gr.Radio(
                choices=["video", "frames"],
                value="video",
                label="📹 Video Mode",
                info="'video' for FPS-based, 'frames' for fixed count"
            )
            
            num_frames_slider = gr.Slider(
                minimum=1,
                maximum=30,
                step=1,
                value=8,
                label="🖼️ Number of Frames",
                info="Fixed frame count (used when video_mode='frames')"
            )
            
            with gr.Accordion("🔧 Advanced Settings", open=False):
                max_tokens_slider = gr.Slider(
                    minimum=128,
                    maximum=2048,
                    step=128,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    info="Maximum length of generated response"
                )
                
                temperature_slider = gr.Slider(
                    minimum=0.01,
                    maximum=2.0,
                    step=0.01,
                    value=TEMPERATURE,
                    label="🌡️ Temperature",
                    info="Higher = more creative, lower = more focused"
                )
                
                top_k_slider = gr.Slider(
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=50,
                    label="🔝 Top-K",
                    info="Sample from top K tokens"
                )
                
                top_p_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                    label="🎯 Top-P (Nucleus)",
                    info="Cumulative probability threshold"
                )

        # RIGHT COLUMN
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Question & Answer")
            
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Ask a question about the selected video...",
                lines=4,
                value="Describe what is happening in this video."
            )
            
            answer = gr.Textbox(
                label="Model Answer",
                lines=20,
                interactive=False
            )
            
            run = gr.Button("🚀 Run Inference", variant="primary", size="lg")
    
    gr.Markdown("""
    ---
    **ℹ️ Tips:**
    - **Quantization:** 16-bit (full precision), 8-bit (2x memory savings), 4-bit (4x memory savings with slight quality loss)
    - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
    - Use video_mode='frames' for fixed frame count (useful for very long videos)
    - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
    - Top-K and Top-P control output diversity
    """)

    # Update video player when dropdown changes
    video_id.change(
        fn=get_video_path,
        inputs=video_id,
        outputs=video_player
    )

    # Reload model with new quantization
    reload_button.click(
        fn=load_model_with_quantization,
        inputs=[
            model_name_radio,
            quantization_radio,
        ],
        outputs=reload_status
    )

    # Run inference
    run.click(
        fn=video_qa,
        inputs=[
            video_id,
            prompt,
            video_mode_radio,
            fps_slider,
            num_frames_slider,
            max_tokens_slider,
            temperature_slider,
            top_k_slider,
            top_p_slider,
        ],
        outputs=answer
    )



demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True
)


# #---------------
# #---------------
# #---------------
# # Feb 5, 2026
# #---------------
# import os
# import sys
# import json
# from pathlib import Path
# import gradio as gr

# # Allow importing your models package
# sys.path.insert(0, str(Path(__file__).parent))

# from models import load_model
# from models.base import BaseVideoModel

# # ----------------------
# # CONFIG
# # ----------------------
# QWEN_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
# LLAVA_MODEL_PATH = "lmms-lab/LLaVA-Video-7B-Qwen2"
# DEVICE_MAP_QWEN = "cuda:0"
# DEVICE_MAP_LLAVA = "cuda:0"  # Both models on same GPU

# VIDEO_DIR = "/home/raman/Gradio_Qwen3vl4bInstruct/videos"
# LABELS_JSON = "/home/raman/Gradio_Qwen3vl4bInstruct/SSv2_prepost_sampled.json"

# DEFAULT_FPS = 1.0
# MAX_NEW_TOKENS = 512
# TEMPERATURE = 0.01

# # ----------------------
# # Load video labels
# # ----------------------
# print("Loading video labels...")
# video_labels = {}
# try:
#     with open(LABELS_JSON, 'r') as f:
#         labels_data = json.load(f)
#         for item in labels_data:
#             video_labels[item['id']] = {
#                 'label': item['label'],
#                 'template': item.get('template', ''),
#                 'action_group': item.get('action_group', '')
#             }
#     print(f"Loaded {len(video_labels)} video labels.")
# except Exception as e:
#     print(f"Warning: Could not load labels JSON: {e}")

# # ----------------------
# # Load models
# # ----------------------
# print("Loading Qwen3-VL-4B-Instruct...")
# qwen_model: BaseVideoModel = load_model(
#     QWEN_MODEL_PATH,
#     device_map=DEVICE_MAP_QWEN,
# )
# print("Qwen model loaded.")

# print("Loading LLaVA-Video-7B...")
# llava_model: BaseVideoModel = load_model(
#     LLAVA_MODEL_PATH,
#     device_map=DEVICE_MAP_LLAVA,
# )
# print("LLaVA model loaded.")

# # ----------------------
# # Collect video IDs
# # ----------------------
# VIDEO_IDS = sorted([
#     os.path.splitext(f)[0]
#     for f in os.listdir(VIDEO_DIR)
#     if f.endswith(".mp4")
# ])

# print(f"Found {len(VIDEO_IDS)} videos.")

# # ----------------------
# # Helpers
# # ----------------------
# def get_video_path(video_id: str):
#     if not video_id:
#         return None
#     path = os.path.join(VIDEO_DIR, video_id + ".mp4")
#     return path if os.path.exists(path) else None

# def get_video_label(video_id: str):
#     if not video_id:
#         return ""
#     info = video_labels.get(video_id, {})
#     label = info.get('label', 'No label available')
#     action_group = info.get('action_group', '')
    
#     if action_group:
#         return f"**Label:** {label}\n\n**Action Group:** {action_group}"
#     return f"**Label:** {label}"

# def update_video_info(video_id: str):
#     """Returns video path and label when video is selected"""
#     video_path = get_video_path(video_id)
#     label = get_video_label(video_id)
#     return video_path, label

# # ----------------------
# # Inference functions
# # ----------------------
# def qwen_inference(video_id: str, prompt: str, fps: float) -> str:
#     if not video_id:
#         return "❌ Please select a video ID."
    
#     if not prompt.strip():
#         return "❌ Please enter a prompt."
    
#     video_path = get_video_path(video_id)
#     if video_path is None:
#         return f"❌ Video not found: {video_id}.mp4"
    
#     try:
#         response = qwen_model.chat(
#             prompt=prompt,
#             video_path=video_path,
#             fps=fps,
#             max_new_tokens=MAX_NEW_TOKENS,
#             temperature=TEMPERATURE,
#         )
#         return response
    
#     except Exception as e:
#         return f"❌ Error during Qwen inference: {str(e)}"

# def llava_inference(video_id: str, prompt: str, fps: float) -> str:
#     if not video_id:
#         return "❌ Please select a video ID."
    
#     if not prompt.strip():
#         return "❌ Please enter a prompt."
    
#     video_path = get_video_path(video_id)
#     if video_path is None:
#         return f"❌ Video not found: {video_id}.mp4"
    
#     try:
#         response = llava_model.chat(
#             prompt=prompt,
#             video_path=video_path,
#             fps=fps,
#             max_new_tokens=MAX_NEW_TOKENS,
#             temperature=TEMPERATURE,
#         )
#         return response
    
#     except Exception as e:
#         return f"❌ Error during LLaVA inference: {str(e)}"

# # ----------------------
# # Gradio UI
# # ----------------------
# with gr.Blocks(title="Video QA – Qwen3-VL & LLaVA-Video", theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# 🎥 Video Question Answering Demo")
#     gr.Markdown("Compare **Qwen3-VL-4B-Instruct** and **LLaVA-Video-7B-Qwen2** on the same videos")
    
#     # TOP SECTION: Video Selection and Display
#     with gr.Row():
#         with gr.Column(scale=1):
#             video_id = gr.Dropdown(
#                 choices=VIDEO_IDS,
#                 label="📁 Select Video ID",
#                 filterable=True,
#                 interactive=True,
#                 value=VIDEO_IDS[0] if VIDEO_IDS else None
#             )
            
#             video_label = gr.Markdown(
#                 value=get_video_label(VIDEO_IDS[0]) if VIDEO_IDS else "",
#                 label="Video Information"
#             )
            
#             fps_slider = gr.Slider(
#                 minimum=0.5,
#                 maximum=5.0,
#                 step=0.5,
#                 value=DEFAULT_FPS,
#                 label="🎞️ Frames Per Second (FPS)",
#                 info="Higher FPS = more frames analyzed (slower but more detailed)"
#             )
        
#         with gr.Column(scale=2):
#             video_player = gr.Video(
#                 label="Selected Video",
#                 autoplay=False,
#                 height=360,
#                 value=get_video_path(VIDEO_IDS[0]) if VIDEO_IDS else None
#             )
    
#     gr.Markdown("---")
    
#     # BOTTOM SECTION: Two Models Side by Side
#     with gr.Row():
#         # QWEN COLUMN
#         with gr.Column(scale=1):
#             gr.Markdown("### 🤖 Qwen3-VL-4B-Instruct")
            
#             qwen_prompt = gr.Textbox(
#                 label="Prompt",
#                 placeholder="Ask a question about the video...",
#                 lines=4,
#                 value="Describe what is happening in this video."
#             )
            
#             qwen_answer = gr.Textbox(
#                 label="Qwen Answer",
#                 lines=10,
#                 interactive=False
#             )
            
#             qwen_run = gr.Button("🚀 Run Qwen Inference", variant="primary")
        
#         # LLAVA COLUMN
#         with gr.Column(scale=1):
#             gr.Markdown("### 🎬 LLaVA-Video-7B-Qwen2")
            
#             llava_prompt = gr.Textbox(
#                 label="Prompt",
#                 placeholder="Ask a question about the video...",
#                 lines=4,
#                 value="Describe what is happening in this video."
#             )
            
#             llava_answer = gr.Textbox(
#                 label="LLaVA Answer",
#                 lines=10,
#                 interactive=False
#             )
            
#             llava_run = gr.Button("🚀 Run LLaVA Inference", variant="primary")
    
#     # Model info footer
#     gr.Markdown("""
#     ---
#     **Model Information:**
#     - **Qwen3-VL-4B-Instruct**: 4B parameter vision-language model
#     - **LLaVA-Video-7B-Qwen2**: 7B parameter video understanding model
    
#     **Settings:** Max Tokens={}, Temperature={}
#     """.format(MAX_NEW_TOKENS, TEMPERATURE))
    
#     # ----------------------
#     # Event Handlers
#     # ----------------------
    
#     # Update video player and label when dropdown changes
#     video_id.change(
#         fn=update_video_info,
#         inputs=video_id,
#         outputs=[video_player, video_label]
#     )
    
#     # Run Qwen inference
#     qwen_run.click(
#         fn=qwen_inference,
#         inputs=[video_id, qwen_prompt, fps_slider],
#         outputs=qwen_answer
#     )
    
#     # Run LLaVA inference
#     llava_run.click(
#         fn=llava_inference,
#         inputs=[video_id, llava_prompt, fps_slider],
#         outputs=llava_answer
#     )

# # Launch
# demo.launch(
#     server_name="0.0.0.0",
#     server_port=7860,
#     share=True
# )