import os
import sys
from pathlib import Path
import gc

import torch
import gradio as gr

# Allow importing your models package
sys.path.insert(0, str(Path(__file__).parent))
from models import load_model
from models.base import BaseVideoModel

# ----------------------
# CONFIG
# ----------------------
DEVICE_MAP = "cuda:0"
VIDEO_DIR = str(Path(__file__).parent / "videos")
FPS = 1.0
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.01

# ----------------------
# Model loading with quantization support
# ----------------------
# Currently loaded model; None until the first successful load (or after a
# failed reload).
model: BaseVideoModel | None = None
current_model_name = "Qwen3-VL-4B-Instruct"
current_quantization = "16-bit"


def load_model_with_quantization(model_name: str, quantization: str) -> str:
    """Load (or reload) the global ``model`` with the requested quantization.

    Args:
        model_name: One of the model names offered in the UI
            (e.g. "Qwen3-VL-4B-Instruct", "LLaVA-Video-7B-Qwen2").
        quantization: "16-bit" (full precision), "8-bit", or "4-bit".

    Returns:
        A human-readable status string shown in the Gradio status textbox.

    Side effects:
        Rebinds the module-level ``model``, ``current_model_name`` and
        ``current_quantization``, and frees GPU memory held by any
        previously loaded model.
    """
    global model, current_model_name, current_quantization

    # Free GPU memory if a model is already loaded.  Rebind to None rather
    # than `del model`: deleting the global would leave the name undefined
    # if the load below raises, breaking every later `model is not None`
    # check with a NameError.
    if model is not None:
        print("Unloading existing model and freeing GPU memory...")
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("GPU memory cleared.")

    # Map the quantization choice onto loader flags; "16-bit" (the default
    # precision) leaves both flags False.
    load_8bit = quantization == "8-bit"
    load_4bit = quantization == "4-bit"

    print(f"Loading {model_name} with {quantization} quantization...")

    # Resolve the checkpoint path:
    # - LLaVA-Video-7B uses the HF-converted repo for transformers v5
    #   compatibility instead of the default version.
    # - Qwen models are hosted under the "Qwen/" org on the Hub.
    model_path = model_name
    if model_name == "LLaVA-Video-7B-Qwen2":
        model_path = "Isotr0py/LLaVA-Video-7B-Qwen2-hf"
    elif model_name.startswith("Qwen"):
        model_path = f"Qwen/{model_name}"

    model = load_model(
        model_path,
        device_map=DEVICE_MAP,
        load_8bit=load_8bit,
        load_4bit=load_4bit,
    )
    current_model_name = model_name
    current_quantization = quantization

    print(f"{model_name} loaded with {quantization} quantization.")
    return f"✅ {model_name} loaded successfully with {quantization} quantization"


# Load model initially with 16-bit (normal)
load_model_with_quantization(current_model_name, current_quantization)

# ----------------------
# Collect video IDs
# ----------------------
# Every <id>.mp4 in VIDEO_DIR, sorted by id (extension stripped).
VIDEO_IDS = sorted(
    os.path.splitext(f)[0] for f in os.listdir(VIDEO_DIR) if f.endswith(".mp4")
)

# ----------------------
# Helpers
# ----------------------
def get_video_path(video_id: str):
    """Return the path of ``<video_id>.mp4`` in VIDEO_DIR, or None.

    None is returned both for an empty/missing id and for a file that does
    not exist, so the caller only needs a single check.
    """
    if not video_id:
        return None
    path = os.path.join(VIDEO_DIR, video_id + ".mp4")
    return path if os.path.exists(path) else None

# ----------------------
# Inference function
# ----------------------
def video_qa(
    video_id: str,
    prompt: str,
    video_mode: str,
    fps: float,
    num_frames: int,
    max_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
) -> str:
    """Run one video question-answering turn with the loaded model.

    Parameters mirror the Gradio controls.  Returns either the model's
    answer or a human-readable "❌ ..." error string — the UI renders both
    the same way, so this function never raises to the caller.
    """
    if not video_id:
        return "❌ Please select a video ID."
    if not prompt.strip():
        return "❌ Please enter a prompt."

    video_path = get_video_path(video_id)
    if video_path is None:
        return f"❌ Video not found: {video_id}.mp4"

    try:
        # Prepare generation config
        generation_config = {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
        }
        kwargs = {
            "prompt": prompt,
            "video_path": video_path,
            "fps": fps,
            "num_frames": num_frames,
            **generation_config,
        }
        # Qwen models accept a `video_mode` keyword; other models raise
        # TypeError on the unexpected argument, in which case we retry
        # without it.  NOTE(review): a TypeError raised *inside* model.chat
        # for any other reason also triggers this fallback call.
        try:
            response = model.chat(**kwargs, video_mode=video_mode)
        except TypeError:
            response = model.chat(**kwargs)
        return response
    except Exception as e:
        return f"❌ Error during inference: {str(e)}"

# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="Video Inference Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎥 Video Inference")

    with gr.Row():
        # LEFT COLUMN: video selection + model/sampling parameters
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Video Selection")
            video_id = gr.Dropdown(
                choices=VIDEO_IDS,
                label="Video ID",
                filterable=True,
                interactive=True,
                value=VIDEO_IDS[0] if VIDEO_IDS else None,
            )
            video_player = gr.Video(
                label="Selected Video",
                autoplay=False,
                height=300,
            )

            gr.Markdown("### 🤖 Model Name")
            model_name_radio = gr.Radio(
                choices=[
                    "Qwen3-VL-4B-Instruct",
                    "Qwen3-VL-8B-Instruct",
                    "Qwen3-VL-2B-Thinking",
                    "Qwen3-VL-4B-Thinking",
                    "LLaVA-Video-7B-Qwen2",
                ],
                value="Qwen3-VL-4B-Instruct",
                label="🤖 Model Name",
                info="Select the model to use for inference",
            )

            gr.Markdown("### ⚙️ Model Parameters")
            quantization_radio = gr.Radio(
                choices=["16-bit", "8-bit", "4-bit"],
                value="16-bit",
                label="🔧 Model Quantization",
                info="16-bit: Default precision, 8-bit/4-bit: Reduced memory usage",
            )
            reload_button = gr.Button("🔄 Reload Model", variant="secondary")
            reload_status = gr.Textbox(
                label="Model Status",
                value=f"{current_model_name} loaded with {current_quantization} quantization",
                interactive=False,
                lines=1,
            )
            fps_slider = gr.Slider(
                minimum=0.5,
                maximum=10.0,
                step=0.5,
                value=FPS,
                label="🎞️ Frames Per Second (FPS)",
                info="Sample rate for video frames",
            )
            video_mode_radio = gr.Radio(
                choices=["video", "frames"],
                value="video",
                label="📹 Video Mode",
                info="'video' for FPS-based, 'frames' for fixed count",
            )
            num_frames_slider = gr.Slider(
                minimum=1,
                maximum=30,
                step=1,
                value=8,
                label="🖼️ Number of Frames",
                info="Fixed frame count (used when video_mode='frames')",
            )

            with gr.Accordion("🔧 Advanced Settings", open=False):
                max_tokens_slider = gr.Slider(
                    minimum=128,
                    maximum=2048,
                    step=128,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    info="Maximum length of generated response",
                )
                temperature_slider = gr.Slider(
                    minimum=0.01,
                    maximum=2.0,
                    step=0.01,
                    value=TEMPERATURE,
                    label="🌡️ Temperature",
                    info="Higher = more creative, lower = more focused",
                )
                top_k_slider = gr.Slider(
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=50,
                    label="🔝 Top-K",
                    info="Sample from top K tokens",
                )
                top_p_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                    label="🎯 Top-P (Nucleus)",
                    info="Cumulative probability threshold",
                )

        # RIGHT COLUMN: prompt + answer
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Question & Answer")
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Ask a question about the selected video...",
                lines=4,
                value="Describe what is happening in this video.",
            )
            answer = gr.Textbox(
                label="Model Answer",
                lines=20,
                interactive=False,
            )
            run = gr.Button("🚀 Run Inference", variant="primary", size="lg")
            gr.Markdown("""
            ---
            **ℹ️ Tips:**
            - **Quantization:** 16-bit (full precision), 8-bit (2x memory savings), 4-bit (4x memory savings with slight quality loss)
            - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
            - Use video_mode='frames' for fixed frame count (useful for very long videos)
            - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
            - Top-K and Top-P control output diversity
            """)

    # Update video player when dropdown changes
    video_id.change(
        fn=get_video_path,
        inputs=video_id,
        outputs=video_player,
    )

    # Reload model with new quantization
    reload_button.click(
        fn=load_model_with_quantization,
        inputs=[
            model_name_radio,
            quantization_radio,
        ],
        outputs=reload_status,
    )

    # Run inference
    run.click(
        fn=video_qa,
        inputs=[
            video_id,
            prompt,
            video_mode_radio,
            fps_slider,
            num_frames_slider,
            max_tokens_slider,
            temperature_slider,
            top_k_slider,
            top_p_slider,
        ],
        outputs=answer,
    )

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
)