Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| import spaces | |
| import tempfile | |
| import os | |
| import subprocess | |
| import sys | |
# Install dependencies if needed
def install_dependencies(packages=("decord", "imageio", "imageio-ffmpeg")):
    """Install every package in ``packages`` that cannot be imported.

    Each entry is a pip distribution name. The module name used for the
    import probe is the distribution name with ``-`` replaced by ``_``
    (for example ``imageio-ffmpeg`` is probed as ``imageio_ffmpeg``).

    Args:
        packages: Iterable of pip distribution names to check. Defaults to
            the three packages this script needs for VideoLLaMA3.

    Returns:
        list: Names of the packages that were installed by this call, in
        the order they were processed (empty when nothing was missing).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    import importlib

    installed = []
    for package in packages:
        module_name = package.replace("-", "_")
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package, "--quiet"]
            )
            # The package appeared after interpreter start-up; refresh the
            # finder caches so later imports in this process can see it.
            importlib.invalidate_caches()
            installed.append(package)
    return installed
# Install dependencies on startup, before the imports below run
install_dependencies()
from transformers import AutoModelForCausalLM, AutoProcessor
import warnings
warnings.filterwarnings("ignore")

# Global variables
# Both are assigned by load_videollama3_model(); None until a load runs.
processor = None
model = None
# Set to True by load_videollama3_model() once loading finished without error.
model_loaded = False
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_videollama3_model():
    """Load the VideoLLaMA3-7B processor and model into the module globals.

    On success ``processor`` and ``model`` are assigned and ``model_loaded``
    is set to True. Any exception is caught, ``model_loaded`` is set to
    False, and the failure is reported through the return value.

    Returns:
        str: A status message (also printed) describing success or failure.
    """
    global model, processor, model_loaded

    checkpoint = "DAMO-NLP-SG/VideoLLaMA3-7B"
    try:
        print("π Loading VideoLLaMA3-7B model...")

        print("Loading processor...")
        processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)

        print("Loading VideoLLaMA3 model (this may take several minutes)...")
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

        model_loaded = True
        status = "β VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
        print(status)
        return status
    except Exception as exc:
        # Report the failure to the UI instead of letting the click handler crash.
        model_loaded = False
        status = f"β Failed to load VideoLLaMA3: {exc}"
        print(status)
        return status
def _probe_video(video_path):
    """Read frame rate, frame count, duration and resolution with OpenCV.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        dict: ``fps`` (as reported by OpenCV), ``total_frames``, ``width``
        and ``height`` (ints), and ``duration`` computed as
        ``total_frames / fps`` (0 when the reported fps is not positive).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        # Release the capture even if one of the property reads raises.
        cap.release()
    return {
        "fps": fps,
        "total_frames": total_frames,
        "duration": total_frames / fps if fps > 0 else 0,
        "width": width,
        "height": height,
    }


def _extract_assistant_reply(response):
    """Return the text after the last chat-role marker in ``response``.

    The marker is ``"assistant"`` when that word occurs in the text
    (checked case-insensitively), otherwise ``"user:"``; with neither
    present the whole text is used. A literal ``"</s>"`` is removed and the
    result is stripped.
    """
    lowered = response.lower()
    if "assistant" in lowered:
        marker = "assistant"
    elif "user:" in lowered:
        marker = "user:"
    else:
        marker = None

    # NOTE(review): detection is case-insensitive but the split is
    # case-sensitive, so e.g. "Assistant:" is detected yet not removed.
    # Kept as in the original; confirm the intended behavior.
    reply = response if marker is None else response.split(marker)[-1]
    return reply.strip().replace("</s>", "").strip()


def analyze_video_with_videollama3(video_file, question, progress=gr.Progress()):
    """Analyze video using VideoLLaMA3.

    Args:
        video_file: Path of the uploaded video, or None when nothing was
            uploaded. It is passed to the processor as ``video_path`` and
            to OpenCV for the technical summary.
        question: The user's question about the video.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        str: The formatted analysis on success; a validation message when
        the video, the question or the model is missing; or, when the
        analysis raises, a fallback report with basic video information
        (just the error text if the video cannot be probed either).
    """
    if video_file is None:
        return "β Please upload a video file first."
    # Guard against None as well as blank text so .strip() cannot raise.
    if not question or not question.strip():
        return "β Please enter a question about the video."
    if not model_loaded or model is None or processor is None:
        return "β VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for completion."

    try:
        progress(0.1, desc="Preparing video for analysis...")
        # Create the conversation in the format VideoLLaMA3 expects
        conversation = [
            {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 64}},
                    {"type": "text", "text": question},
                ],
            },
        ]

        progress(0.3, desc="Processing video with VideoLLaMA3...")
        # NOTE(review): the published VideoLLaMA3 usage example appears to
        # pass add_generation_prompt=True to the processor - confirm whether
        # it is needed here.
        inputs = processor(conversation=conversation, return_tensors="pt")
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        if "pixel_values" in inputs:
            # Match the bfloat16 dtype the model was loaded with.
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

        progress(0.7, desc="Generating AI response...")
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        progress(0.9, desc="Processing response...")
        response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
        ai_response = _extract_assistant_reply(response)

        # Get video info for technical details
        info = _probe_video(video_file)
        duration = info["duration"]
        fps = info["fps"]
        total_frames = info["total_frames"]
        width = info["width"]
        height = info["height"]

        progress(1.0, desc="Analysis complete!")
        # Format the final response
        return (
            "π₯ **VideoLLaMA3 AI Video Analysis**\n"
            "β **Your Question:**\n"
            f"{question}\n"
            "π€ **AI Analysis:**\n"
            f"{ai_response}\n"
            "π **Video Information:**\n"
            f"β’ Duration: {duration:.1f} seconds\n"
            f"β’ Frame Rate: {fps:.1f} FPS\n"
            f"β’ Total Frames: {total_frames:,}\n"
            f"β’ Resolution: {width}x{height}\n"
            "β‘ **Powered by:** VideoLLaMA3-7B (Multimodal AI)\n"
        )
    except Exception as e:
        error_msg = f"β Error during VideoLLaMA3 analysis: {str(e)}"
        print(error_msg)

        # Fallback: Basic video analysis if VideoLLaMA3 fails
        try:
            info = _probe_video(video_file)
            duration = info["duration"]
            fps = info["fps"]
            total_frames = info["total_frames"]
            return (
                "β VideoLLaMA3 analysis failed, but here's what I can tell you:\n"
                "**Video Technical Info:**\n"
                f"β’ Duration: {duration:.1f} seconds\n"
                f"β’ Frame Rate: {fps:.1f} FPS\n"
                f"β’ Total Frames: {total_frames:,}\n"
                f"**Error:** {str(e)}\n"
                "**Suggestion:** Try reloading the model or using a shorter video file.\n"
            )
        except Exception:
            # Probing failed too; the plain error text is all we can offer.
            return error_msg
def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with a model-loading row, a video/question
    input column next to a results box, a grid of example-question buttons
    and a usage section, and wires the buttons to
    ``load_videollama3_model`` and ``analyze_video_with_videollama3``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a source whose indentation was lost - confirm.
    example_questions = [
        "What is happening in this video? Describe the scene in detail.",
        "Who are the people in this video and what are they doing?",
        "Describe the setting, location, and environment shown.",
        "What objects, animals, or items can you see in the video?",
        "What is the mood, atmosphere, or emotion conveyed?",
        "Summarize the key events that occur chronologically.",
    ]
    instructions_md = (
        "\n"
        "### π Instructions:\n"
        '1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)\n'
        "2. **Then:** Upload your video file (works best with videos under 2 minutes)\n"
        "3. **Ask:** Type your question about the video content\n"
        '4. **Analyze:** Click "Analyze Video with VideoLLaMA3" to get detailed insights\n'
        "π‘ **Tips:**\n"
        "- Keep videos under 2 minutes for best performance\n"
        "- Ask specific, detailed questions for better results\n"
        "- The model will analyze up to 64 frames from your video\n"
    )

    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# π₯ VideoLLaMA3 Video Analysis Tool")
        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")

        # Model loading section
        with gr.Row():
            with gr.Column(scale=3):
                status_box = gr.Textbox(
                    label="π€ Model Status",
                    value="Model not loaded - Click the button to load VideoLLaMA3-7B β",
                    interactive=False,
                    lines=2,
                )
            with gr.Column(scale=1):
                load_button = gr.Button("π Load VideoLLaMA3 Model", variant="primary", size="lg")
        load_button.click(load_videollama3_model, outputs=status_box)

        gr.Markdown("---")

        # Main interface
        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="πΉ Upload Video (MP4, AVI, MOV, WebM)",
                    height=350,
                )
                question_input = gr.Textbox(
                    label="β Ask about the video",
                    placeholder="What is happening in this video? Describe it in detail.",
                    lines=3,
                    max_lines=5,
                )
                analyze_button = gr.Button("π Analyze Video with VideoLLaMA3", variant="primary", size="lg")
            with gr.Column(scale=1):
                results_box = gr.Textbox(
                    label="π― AI Analysis Results",
                    lines=25,
                    max_lines=30,
                    show_copy_button=True,
                )

        # Example questions
        gr.Markdown("### π‘ Example Questions (click to use):")
        with gr.Row():
            # One column per pair of questions; each button copies its own
            # text into the question box.
            for start in range(0, len(example_questions), 2):
                with gr.Column():
                    for prompt in example_questions[start:start + 2]:
                        example_button = gr.Button(prompt, size="sm")
                        # Bind the text as a default so each lambda keeps its own prompt.
                        example_button.click(lambda x=prompt: x, outputs=question_input)

        # Connect the analyze button
        analyze_button.click(
            analyze_video_with_videollama3,
            inputs=[video_input, question_input],
            outputs=results_box,
            show_progress=True,
        )

        gr.Markdown("---")
        gr.Markdown(instructions_md)

    return demo
if __name__ == "__main__":
    # Build the interface and launch it only when run as a script.
    demo = create_interface()
    demo.launch()