Spaces:

cweigendev
/

videoanalyzer

Paused

App Files Files Community

cweigendev committed on Aug 6

Commit

92be57f

verified ·

1 Parent(s): 9d27de3

Create app_new.py

Browse files

Files changed (1) hide show

app_new.py +324 -0

app_new.py ADDED Viewed

	@@ -0,0 +1,324 @@

+import gradio as gr
+import torch
+import cv2
+import numpy as np
+from PIL import Image
+import spaces
+import tempfile
+import os
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import warnings
+# Suppress all Python warnings for the lifetime of the process.
+warnings.filterwarnings("ignore")
+# Global variables
+# Language model instance; stays None until load_videollama3_model() succeeds.
+model = None
+# Tokenizer paired with `model`; stays None until load_videollama3_model() succeeds.
+tokenizer = None
+# Device that tokenized inputs are moved to before generation.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Flag checked by analyze_video_with_ai() before it touches `model`/`tokenizer`.
+model_loaded = False
+@spaces.GPU
+def load_videollama3_model():
+    """Load VideoLLaMA3 model with proper configuration"""
+    global model, tokenizer, model_loaded
+    try:
+        print("🔄 Loading VideoLLaMA3-7B model...")
+        model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"
+        # Configure quantization to fit in GPU memory
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+        # Load tokenizer
+        print("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            use_fast=False
+        )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # Load model
+        print("Loading VideoLLaMA3 model (this may take several minutes)...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=quantization_config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            attn_implementation="flash_attention_2"
+        )
+        model_loaded = True
+        success_msg = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
+        print(success_msg)
+        return success_msg
+    except Exception as e:
+        model_loaded = False
+        error_msg = f"❌ Failed to load VideoLLaMA3: {str(e)}"
+        print(error_msg)
+        return error_msg
+def extract_video_frames(video_path, max_frames=16, target_fps=1):
+    """Extract frames from video for VideoLLaMA3 processing"""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        duration = total_frames / original_fps if original_fps > 0 else 0
+        if total_frames == 0:
+            return [], None
+        # Calculate frame sampling
+        frame_interval = max(1, int(original_fps / target_fps))
+        frame_indices = list(range(0, total_frames, frame_interval))[:max_frames]
+        frames = []
+        valid_indices = []
+        for idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+            ret, frame = cap.read()
+            if ret:
+                # Convert BGR to RGB
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                # Resize to reasonable size for processing
+                height, width = frame_rgb.shape[:2]
+                if max(height, width) > 720:
+                    scale = 720 / max(height, width)
+                    new_height, new_width = int(height * scale), int(width * scale)
+                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
+                frames.append(Image.fromarray(frame_rgb))
+                valid_indices.append(idx)
+        cap.release()
+        video_info = {
+            "duration": duration,
+            "original_fps": original_fps,
+            "total_frames": total_frames,
+            "extracted_frames": len(frames),
+            "resolution": f"{width}x{height}"
+        }
+        return frames, video_info
+    except Exception as e:
+        print(f"Error extracting frames: {e}")
+        return [], None
+@spaces.GPU
+def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
+    """Analyze video using VideoLLaMA3 model"""
+    if video_file is None:
+        return "❌ Please upload a video file first."
+    if not question.strip():
+        return "❌ Please enter a question about the video."
+    if not model_loaded:
+        return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for it to complete."
+    try:
+        progress(0.1, desc="Extracting video frames...")
+        # Extract frames from video
+        frames, video_info = extract_video_frames(video_file, max_frames=16)
+        if not frames or video_info is None:
+            return "❌ Could not process video. Please check the video format and try again."
+        progress(0.3, desc="Preparing AI input...")
+        # Create a detailed prompt for video analysis
+        system_prompt = "You are VideoLLaMA3, an advanced AI assistant specialized in video understanding. Analyze the video frames and provide detailed, accurate responses about the video content."
+        user_prompt = f"""I have a video with the following specifications:
+- Duration: {video_info['duration']:.1f} seconds
+- Original FPS: {video_info['original_fps']:.1f}
+- Total frames: {video_info['total_frames']}
+- Analyzed frames: {video_info['extracted_frames']}
+- Resolution: {video_info['resolution']}
+Question: {question}
+Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames."""
+        progress(0.5, desc="Processing with VideoLLaMA3...")
+        # Prepare conversation format
+        conversation = f"System: {system_prompt}\n\nHuman: {user_prompt}\n\nAssistant:"
+        # Tokenize input
+        inputs = tokenizer(
+            conversation,
+            return_tensors="pt",
+            max_length=2048,
+            truncation=True,
+            padding=True
+        ).to(device)
+        progress(0.7, desc="Generating AI response...")
+        # Generate response
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                temperature=0.7,
+                do_sample=True,
+                top_p=0.9,
+                repetition_penalty=1.1,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
+            )
+        # Decode response
+        full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Extract just the assistant's response
+        if "Assistant:" in full_response:
+            ai_response = full_response.split("Assistant:")[-1].strip()
+        else:
+            ai_response = full_response.split(conversation)[-1].strip()
+        progress(0.9, desc="Formatting results...")
+        # Format the final response
+        formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
+❓ **Your Question:**
+{question}
+🤖 **AI Analysis:**
+{ai_response}
+📊 **Video Information:**
+• Duration: {video_info['duration']:.1f} seconds
+• Frame Rate: {video_info['original_fps']:.1f} FPS
+• Total Frames: {video_info['total_frames']:,}
+• Analyzed Frames: {video_info['extracted_frames']}
+• Resolution: {video_info['resolution']}
+⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
+"""
+        progress(1.0, desc="Analysis complete!")
+        return formatted_response
+    except torch.cuda.OutOfMemoryError:
+        torch.cuda.empty_cache()
+        return "❌ GPU memory error. Please try with a shorter video or restart the space."
+    except Exception as e:
+        error_msg = f"❌ Error during video analysis: {str(e)}"
+        print(error_msg)
+        return error_msg
+def create_interface():
+    """Create the Gradio interface"""
+    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎥 VideoLLaMA3 AI Video Analysis Tool")
+        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
+        # Model loading section
+        with gr.Row():
+            with gr.Column(scale=3):
+                model_status = gr.Textbox(
+                    label="🤖 Model Status",
+                    value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
+                    interactive=False,
+                    lines=2
+                )
+            with gr.Column(scale=1):
+                load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+        load_btn.click(load_videollama3_model, outputs=model_status)
+        gr.Markdown("---")
+        # Main interface
+        with gr.Row():
+            with gr.Column(scale=1):
+                video_input = gr.Video(
+                    label="📹 Upload Video (MP4, AVI, MOV, WebM)",
+                    height=350
+                )
+                question_input = gr.Textbox(
+                    label="❓ Ask about the video",
+                    placeholder="What is happening in this video? Describe it in detail.",
+                    lines=3,
+                    max_lines=5
+                )
+                analyze_btn = gr.Button("🔍 Analyze Video with AI", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                output = gr.Textbox(
+                    label="🎯 AI Analysis Results",
+                    lines=25,
+                    max_lines=30,
+                    show_copy_button=True
+                )
+        # Example questions
+        gr.Markdown("### 💡 Example Questions (click to use):")
+        example_questions = [
+            "What is happening in this video? Describe the scene in detail.",
+            "Who are the people in this video and what are they doing?",
+            "Describe the setting, location, and environment shown.",
+            "What objects, animals, or items can you see in the video?",
+            "What is the mood, atmosphere, or emotion conveyed?",
+            "Summarize the key events that occur chronologically."
+        ]
+        with gr.Row():
+            for i in range(0, len(example_questions), 2):
+                with gr.Column():
+                    if i < len(example_questions):
+                        btn1 = gr.Button(example_questions[i], size="sm")
+                        btn1.click(lambda x=example_questions[i]: x, outputs=question_input)
+                    if i+1 < len(example_questions):
+                        btn2 = gr.Button(example_questions[i+1], size="sm")
+                        btn2.click(lambda x=example_questions[i+1]: x, outputs=question_input)
+        # Connect the analyze button
+        analyze_btn.click(
+            analyze_video_with_ai,
+            inputs=[video_input, question_input],
+            outputs=output,
+            show_progress=True
+        )
+        gr.Markdown("---")
+        gr.Markdown("""
+        ### 📋 Instructions:
+        1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
+        2. **Then:** Upload your video file (keep it under 2 minutes for best results)
+        3. **Ask:** Type your question about the video content
+        4. **Analyze:** Click "Analyze Video with AI" to get detailed insights
+        💡 **Tips:**
+        - Shorter videos (30s-2min) work best
+        - Ask specific questions for better results
+        - Try different question styles to explore the AI's capabilities
+        """)
+    return demo
+# Script entry point: build the interface and start the Gradio server.
+if __name__ == "__main__":
+    # The Blocks app is bound to the module-level name `demo`.
+    demo = create_interface()
+    demo.launch()