Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| import spaces | |
| import tempfile | |
| import os | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
# Shared module state: the model loader assigns these and the analysis
# callback reads them.
model = None
tokenizer = None
model_loaded = False

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_videollama3_model():
    """Load the VideoLLaMA3-7B tokenizer and model into the module globals.

    The model is loaded with 4-bit NF4 quantization (``BitsAndBytesConfig``)
    and ``device_map="auto"``. The call is idempotent: once a model and
    tokenizer are in place, further calls return the success message without
    loading a second copy of the weights.

    Returns:
        str: A status message for the UI. On failure the message contains the
        exception text; the function never raises.
    """
    global model, tokenizer, model_loaded

    success_msg = "β VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."

    # A repeated click on the load button must not pull another copy of the
    # weights into memory while the first one is still referenced.
    if model_loaded and model is not None and tokenizer is not None:
        print(success_msg)
        return success_msg

    try:
        print("π Loading VideoLLaMA3-7B model...")
        model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"

        # Configure quantization to fit in GPU memory
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False,
        )
        # generate() needs a pad token; reuse EOS when the tokenizer has none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print("Loading VideoLLaMA3 model (this may take several minutes)...")
        # NOTE(review): attn_implementation="flash_attention_2" presumably
        # requires the flash-attn package at runtime - confirm it is installed.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            attn_implementation="flash_attention_2",
        )

        model_loaded = True
        print(success_msg)
        return success_msg
    except Exception as e:
        # Leave no half-initialised state behind (e.g. a tokenizer without a
        # model) so a later retry starts from a clean slate.
        model = None
        tokenizer = None
        model_loaded = False
        error_msg = f"β Failed to load VideoLLaMA3: {str(e)}"
        print(error_msg)
        return error_msg
def extract_video_frames(video_path, max_frames=16, target_fps=1):
    """Sample up to ``max_frames`` RGB frames from a video file.

    Candidate frames are taken every ``original_fps / target_fps`` frames.
    When that yields more than ``max_frames`` candidates, an evenly spaced
    subset is kept so the sample spans the whole clip instead of only its
    beginning. Frames whose longest side exceeds 720 px are downscaled with
    the aspect ratio preserved.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of frames returned.
        target_fps: Desired sampling rate in frames per second.

    Returns:
        tuple: ``(frames, video_info)`` where ``frames`` is a list of
        ``PIL.Image`` objects and ``video_info`` is a dict with the keys
        ``duration``, ``original_fps``, ``total_frames``,
        ``extracted_frames`` and ``resolution`` (``"WxH"`` of the first
        decoded frame before downscaling). Returns ``([], None)`` when the
        video cannot be opened, has no frames, no frame can be decoded, or
        any error occurs; the function never raises.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return [], None

        original_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0 or max_frames < 1:
            return [], None
        duration = total_frames / original_fps if original_fps > 0 else 0

        # Candidate positions at roughly target_fps.
        frame_interval = max(1, int(original_fps / target_fps))
        frame_indices = list(range(0, total_frames, frame_interval))

        # Too many candidates: keep an evenly spaced subset rather than the
        # first max_frames, which would only cover the start of the video.
        if len(frame_indices) > max_frames:
            stride = len(frame_indices) / max_frames
            frame_indices = [
                frame_indices[int(i * stride)] for i in range(max_frames)
            ]

        frames = []
        resolution = None
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                continue

            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            height, width = frame_rgb.shape[:2]
            if resolution is None:
                resolution = f"{width}x{height}"

            # Resize to reasonable size for processing
            longest_side = max(height, width)
            if longest_side > 720:
                scale = 720 / longest_side
                new_size = (
                    max(1, int(width * scale)),
                    max(1, int(height * scale)),
                )
                frame_rgb = cv2.resize(frame_rgb, new_size)

            frames.append(Image.fromarray(frame_rgb))

        # Nothing decodable: report failure explicitly instead of relying on
        # an unbound-variable error further down.
        if not frames:
            return [], None

        video_info = {
            "duration": duration,
            "original_fps": original_fps,
            "total_frames": total_frames,
            "extracted_frames": len(frames),
            "resolution": resolution,
        }
        return frames, video_info
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], None
    finally:
        # Release the capture handle on every exit path, including early
        # returns and errors.
        if cap is not None:
            cap.release()
def _build_video_prompt(question, video_info):
    """Return the text-only conversation string handed to the tokenizer."""
    system_prompt = "You are VideoLLaMA3, an advanced AI assistant specialized in video understanding. Analyze the video frames and provide detailed, accurate responses about the video content."
    user_prompt = "\n".join([
        "I have a video with the following specifications:",
        f"- Duration: {video_info['duration']:.1f} seconds",
        f"- Original FPS: {video_info['original_fps']:.1f}",
        f"- Total frames: {video_info['total_frames']}",
        f"- Analyzed frames: {video_info['extracted_frames']}",
        f"- Resolution: {video_info['resolution']}",
        f"Question: {question}",
        "Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames.",
    ])
    return f"System: {system_prompt}\n\nHuman: {user_prompt}\n\nAssistant:"


def _format_analysis_report(question, ai_response, video_info):
    """Return the user-facing report combining the answer and video metadata."""
    report_lines = [
        "π₯ **VideoLLaMA3 AI Video Analysis**",
        "β **Your Question:**",
        f"{question}",
        "π€ **AI Analysis:**",
        f"{ai_response}",
        "π **Video Information:**",
        f"β’ Duration: {video_info['duration']:.1f} seconds",
        f"β’ Frame Rate: {video_info['original_fps']:.1f} FPS",
        f"β’ Total Frames: {video_info['total_frames']:,}",
        f"β’ Analyzed Frames: {video_info['extracted_frames']}",
        f"β’ Resolution: {video_info['resolution']}",
        "β‘ **Powered by:** VideoLLaMA3-7B (Multimodal AI)",
    ]
    return "\n".join(report_lines) + "\n"


def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Answer a question about an uploaded video and return a formatted report.

    Args:
        video_file: Path of the uploaded video, or None when nothing was
            uploaded.
        question: The user's question about the video.
        progress: Gradio progress tracker used to report the pipeline stage.

    Returns:
        str: The formatted analysis report, or an error message when an input
        is missing, the model is not loaded, the video cannot be read, or
        generation fails. The function never raises.
    """
    if video_file is None:
        return "β Please upload a video file first."
    # Guard against a None question as well as a blank one so the check
    # itself cannot raise.
    if not question or not question.strip():
        return "β Please enter a question about the video."
    if not model_loaded:
        return "β VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for it to complete."

    try:
        progress(0.1, desc="Extracting video frames...")
        frames, video_info = extract_video_frames(video_file, max_frames=16)
        if not frames or video_info is None:
            return "β Could not process video. Please check the video format and try again."

        progress(0.3, desc="Preparing AI input...")
        # NOTE(review): `frames` are extracted, but only the text prompt
        # below is tokenized and passed to generate(), so the model never
        # receives pixel data. Feeding the frames presumably requires the
        # model's multimodal processor - confirm against the model card.
        conversation = _build_video_prompt(question, video_info)

        progress(0.5, desc="Processing with VideoLLaMA3...")
        inputs = tokenizer(
            conversation,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
            padding=True,
        )
        # Follow the device the model was actually placed on; fall back to
        # the module-level default when the model does not expose one.
        inputs = inputs.to(getattr(model, "device", device))
        prompt_length = inputs["input_ids"].shape[1]

        progress(0.7, desc="Generating AI response...")
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Splitting the decoded
        # prompt+reply on "Assistant:" would truncate any answer that happens
        # to contain that text.
        generated_ids = output_ids[0][prompt_length:]
        ai_response = tokenizer.decode(
            generated_ids, skip_special_tokens=True
        ).strip()

        progress(0.9, desc="Formatting results...")
        formatted_response = _format_analysis_report(
            question, ai_response, video_info
        )

        progress(1.0, desc="Analysis complete!")
        return formatted_response
    except torch.cuda.OutOfMemoryError:
        # Free cached blocks so a follow-up request has a chance to succeed.
        torch.cuda.empty_cache()
        return "β GPU memory error. Please try with a shorter video or restart the space."
    except Exception as e:
        error_msg = f"β Error during video analysis: {str(e)}"
        print(error_msg)
        return error_msg
def create_interface():
    """Assemble the Gradio Blocks app and return it without launching it.

    The page contains a model-status row with a load button, a video upload
    and question box next to the results box, a set of example-question
    buttons that fill the question box, and a block of usage instructions.

    Returns:
        The constructed ``gr.Blocks`` application.
    """
    sample_questions = [
        "What is happening in this video? Describe the scene in detail.",
        "Who are the people in this video and what are they doing?",
        "Describe the setting, location, and environment shown.",
        "What objects, animals, or items can you see in the video?",
        "What is the mood, atmosphere, or emotion conveyed?",
        "Summarize the key events that occur chronologically.",
    ]

    # The empty first and last entries keep the leading and trailing newline
    # of the instructions text.
    instructions_md = "\n".join([
        "",
        "### π Instructions:",
        '1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)',
        "2. **Then:** Upload your video file (keep it under 2 minutes for best results)",
        "3. **Ask:** Type your question about the video content",
        '4. **Analyze:** Click "Analyze Video with AI" to get detailed insights',
        "π‘ **Tips:**",
        "- Shorter videos (30s-2min) work best",
        "- Ask specific questions for better results",
        "- Try different question styles to explore the AI's capabilities",
        "",
    ])

    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as app:
        gr.Markdown("# π₯ VideoLLaMA3 AI Video Analysis Tool")
        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")

        # Model status on the left, load button on the right.
        with gr.Row():
            with gr.Column(scale=3):
                status_box = gr.Textbox(
                    label="π€ Model Status",
                    value="Model not loaded - Click the button to load VideoLLaMA3-7B β",
                    interactive=False,
                    lines=2,
                )
            with gr.Column(scale=1):
                load_button = gr.Button(
                    "π Load VideoLLaMA3 Model",
                    variant="primary",
                    size="lg",
                )
        load_button.click(load_videollama3_model, outputs=status_box)

        gr.Markdown("---")

        # Inputs on the left, analysis result on the right.
        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="πΉ Upload Video (MP4, AVI, MOV, WebM)",
                    height=350,
                )
                question_input = gr.Textbox(
                    label="β Ask about the video",
                    placeholder="What is happening in this video? Describe it in detail.",
                    lines=3,
                    max_lines=5,
                )
                analyze_button = gr.Button(
                    "π Analyze Video with AI",
                    variant="primary",
                    size="lg",
                )
            with gr.Column(scale=1):
                result_box = gr.Textbox(
                    label="π― AI Analysis Results",
                    lines=25,
                    max_lines=30,
                    show_copy_button=True,
                )

        # Example questions: two buttons per column; clicking one copies its
        # text into the question box.
        gr.Markdown("### π‘ Example Questions (click to use):")
        with gr.Row():
            for start in range(0, len(sample_questions), 2):
                with gr.Column():
                    for sample in sample_questions[start:start + 2]:
                        sample_button = gr.Button(sample, size="sm")
                        # Bind the text as a default argument so each button
                        # keeps its own question.
                        sample_button.click(
                            lambda text=sample: text,
                            outputs=question_input,
                        )

        # Connect the analyze button
        analyze_button.click(
            fn=analyze_video_with_ai,
            inputs=[video_input, question_input],
            outputs=result_box,
            show_progress=True,
        )

        gr.Markdown("---")
        gr.Markdown(instructions_md)

    return app
if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server only when this file is
    # executed as a script, not when it is imported.
    demo = create_interface()
    demo.launch()