Spaces:

cweigendev
/

videoanalyzer

Paused

App Files Community

cweigendev committed on Aug 6

Commit

9d27de3

verified ·

1 Parent(s): 8c79956

Update app.py

Browse files

Files changed (1) hide show

app.py +191 -162

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import cv2
 import numpy as np
 from PIL import Image
 import spaces
-import gc
 import os
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import warnings
@@ -16,17 +16,17 @@ tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_loaded = False
-def load_videollama_model():
-    """Load VideoLLaMA model with proper error handling"""
     global model, tokenizer, model_loaded
     try:
-        print("🔄 Loading VideoLLaMA model...")
-        # Try to load a working multimodal model
-        model_name = "DAMO-NLP-SG/Video-LLaMA"
-        # Configure quantization for memory efficiency
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.float16,
@@ -42,145 +42,85 @@ def load_videollama_model():
             use_fast=False
         )
-        # Add padding token if not present
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-        # Load model with quantization
-        print("Loading model...")
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             quantization_config=quantization_config,
             device_map="auto",
             torch_dtype=torch.float16,
             trust_remote_code=True,
-            low_cpu_mem_usage=True
         )
         model_loaded = True
-        print("✅ VideoLLaMA model loaded successfully!")
-        return "✅ Model loaded successfully!"
     except Exception as e:
         model_loaded = False
-        error_msg = f"❌ Error loading model: {str(e)}"
         print(error_msg)
-        print("🔄 Falling back to basic video analysis...")
         return error_msg
-def extract_frames(video_path, max_frames=8):
-    """Extract evenly spaced frames from video"""
     try:
         cap = cv2.VideoCapture(video_path)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        duration = total_frames / fps if fps > 0 else 0
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         if total_frames == 0:
-            return [], "No frames found in video"
-        # Get evenly spaced frame indices
-        frame_indices = np.linspace(0, total_frames-1, min(max_frames, total_frames), dtype=int)
         frames = []
-        timestamps = []
-        for frame_idx in frame_indices:
-            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
             ret, frame = cap.read()
             if ret:
                 # Convert BGR to RGB
                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                # Resize for efficiency while maintaining aspect ratio
-                if width > 512 or height > 512:
-                    scale = min(512/width, 512/height)
-                    new_width = int(width * scale)
-                    new_height = int(height * scale)
                     frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                 frames.append(Image.fromarray(frame_rgb))
-                timestamp = frame_idx / fps if fps > 0 else frame_idx
-                timestamps.append(timestamp)
         cap.release()
         video_info = {
-            "total_frames": total_frames,
-            "fps": fps,
             "duration": duration,
-            "resolution": f"{width}x{height}",
-            "extracted_frames": len(frames)
         }
-        return frames, video_info, timestamps
     except Exception as e:
         print(f"Error extracting frames: {e}")
-        return [], {}, []
-def generate_basic_analysis(video_info, question, frames):
-    """Generate basic video analysis when model is not available"""
-    analysis_parts = []
-    # Video technical info
-    analysis_parts.append("📹 **Video Information:**")
-    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
-    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
-    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
-    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
-    analysis_parts.append(f"- Analyzed frames: {len(frames)}")
-    # Basic visual analysis
-    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")
-    if frames:
-        # Analyze first frame for basic info
-        first_frame = np.array(frames[0])
-        avg_brightness = np.mean(first_frame)
-        color_variance = np.var(first_frame)
-        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
-        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
-        # Simple color analysis
-        r_avg = np.mean(first_frame[:,:,0])
-        g_avg = np.mean(first_frame[:,:,1])
-        b_avg = np.mean(first_frame[:,:,2])
-        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
-                        "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
-                        "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
-                        "Balanced colors"
-        analysis_parts.append(f"- Color tone: {dominant_color}")
-    # Question-specific response
-    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
-    analysis_parts.append("\n🤖 **Analysis Response:**")
-    # Generate contextual response based on question keywords
-    question_lower = question.lower()
-    if any(word in question_lower for word in ['what', 'describe', 'see']):
-        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")
-    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
-        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")
-    if any(word in question_lower for word in ['people', 'person', 'human']):
-        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")
-    if any(word in question_lower for word in ['object', 'thing', 'item']):
-        analysis_parts.append("Object detection and identification would require deeper model analysis. ")
-    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
-    return "\n".join(analysis_parts)
 @spaces.GPU
 def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
-    """Main video analysis function"""
     if video_file is None:
         return "❌ Please upload a video file first."
@@ -188,96 +128,174 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
     if not question.strip():
         return "❌ Please enter a question about the video."
     try:
-        progress(0.1, desc="Processing video...")
-        # Extract frames
-        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
-        if not frames:
-            return "❌ Could not extract frames from the video. Please check the video format."
-        progress(0.5, desc="Analyzing content...")
-        if model_loaded and model is not None and tokenizer is not None:
-            # Try to use the actual model
-            try:
-                progress(0.7, desc="Running AI analysis...")
-                # For now, we'll use basic analysis since VideoLLaMA3 integration needs more work
-                result = generate_basic_analysis(video_info, question, frames)
-                result += "\n\n🔄 **Status:** Currently using basic analysis. VideoLLaMA3 integration in progress."
-                progress(1.0, desc="Complete!")
-                return result
-            except Exception as model_error:
-                print(f"Model error: {model_error}")
-                # Fall back to basic analysis
-                pass
-        # Use basic analysis
-        progress(0.8, desc="Generating analysis...")
-        result = generate_basic_analysis(video_info, question, frames)
-        progress(1.0, desc="Complete!")
-        return result
     except Exception as e:
-        return f"❌ Error analyzing video: {str(e)}"
 def create_interface():
     """Create the Gradio interface"""
-    # Try to load model on startup (non-blocking)
-    try:
-        load_videollama_model()
-    except:
-        print("Model loading failed, using basic analysis mode")
-    with gr.Blocks(title="VideoLLama3 Analyzer", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🎥 VideoLLama3 Video Analysis Tool")
-        gr.Markdown("Upload a video and ask questions about its content!")
         with gr.Row():
             with gr.Column(scale=1):
                 video_input = gr.Video(
-                    label="Upload Video (MP4, AVI, MOV)",
-                    height=300
                 )
                 question_input = gr.Textbox(
-                    label="Ask a question about the video",
-                    placeholder="What is happening in this video?",
-                    lines=3
                 )
-                analyze_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
             with gr.Column(scale=1):
                 output = gr.Textbox(
-                    label="Analysis Results",
-                    lines=20,
-                    max_lines=25
                 )
-        gr.Markdown("### 💡 Example Questions:")
-        examples = [
-            "What activities are happening in this video?",
-            "Describe the people or objects you see.",
-            "What is the setting or location?",
-            "Summarize the main events.",
-            "What emotions or mood does this convey?"
-        ]
-        with gr.Row():
-            for example in examples[:3]:
-                btn = gr.Button(example, size="sm")
-                btn.click(lambda x=example: x, outputs=question_input)
         with gr.Row():
-            for example in examples[3:]:
-                btn = gr.Button(example, size="sm")
-                btn.click(lambda x=example: x, outputs=question_input)
         analyze_btn.click(
             analyze_video_with_ai,
             inputs=[video_input, question_input],
@@ -286,7 +304,18 @@ def create_interface():
         )
         gr.Markdown("---")
-        gr.Markdown("🚀 **Status**: Video processing active - Upload a video to test!")
     return demo

 import numpy as np
 from PIL import Image
 import spaces
+import tempfile
 import os
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import warnings
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_loaded = False
+@spaces.GPU
+def load_videollama3_model():
+    """Load VideoLLaMA3 model with proper configuration"""
     global model, tokenizer, model_loaded
     try:
+        print("🔄 Loading VideoLLaMA3-7B model...")
+        model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"
+        # Configure quantization to fit in GPU memory
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.float16,
             use_fast=False
         )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+        # Load model
+        print("Loading VideoLLaMA3 model (this may take several minutes)...")
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             quantization_config=quantization_config,
             device_map="auto",
             torch_dtype=torch.float16,
             trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            attn_implementation="flash_attention_2"
         )
         model_loaded = True
+        success_msg = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
+        print(success_msg)
+        return success_msg
     except Exception as e:
         model_loaded = False
+        error_msg = f"❌ Failed to load VideoLLaMA3: {str(e)}"
         print(error_msg)
         return error_msg
+def extract_video_frames(video_path, max_frames=16, target_fps=1):
+    """Extract frames from video for VideoLLaMA3 processing"""
     try:
         cap = cv2.VideoCapture(video_path)
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        duration = total_frames / original_fps if original_fps > 0 else 0
         if total_frames == 0:
+            return [], None
+        # Calculate frame sampling
+        frame_interval = max(1, int(original_fps / target_fps))
+        frame_indices = list(range(0, total_frames, frame_interval))[:max_frames]
         frames = []
+        valid_indices = []
+        for idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
             ret, frame = cap.read()
             if ret:
                 # Convert BGR to RGB
                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                # Resize to reasonable size for processing
+                height, width = frame_rgb.shape[:2]
+                if max(height, width) > 720:
+                    scale = 720 / max(height, width)
+                    new_height, new_width = int(height * scale), int(width * scale)
                     frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                 frames.append(Image.fromarray(frame_rgb))
+                valid_indices.append(idx)
         cap.release()
         video_info = {
             "duration": duration,
+            "original_fps": original_fps,
+            "total_frames": total_frames,
+            "extracted_frames": len(frames),
+            "resolution": f"{width}x{height}"
         }
+        return frames, video_info
     except Exception as e:
         print(f"Error extracting frames: {e}")
+        return [], None
 @spaces.GPU
 def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
+    """Analyze video using VideoLLaMA3 model"""
     if video_file is None:
         return "❌ Please upload a video file first."
     if not question.strip():
         return "❌ Please enter a question about the video."
+    if not model_loaded:
+        return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for it to complete."
     try:
+        progress(0.1, desc="Extracting video frames...")
+        # Extract frames from video
+        frames, video_info = extract_video_frames(video_file, max_frames=16)
+        if not frames or video_info is None:
+            return "❌ Could not process video. Please check the video format and try again."
+        progress(0.3, desc="Preparing AI input...")
+        # Create a detailed prompt for video analysis
+        system_prompt = "You are VideoLLaMA3, an advanced AI assistant specialized in video understanding. Analyze the video frames and provide detailed, accurate responses about the video content."
+        user_prompt = f"""I have a video with the following specifications:
+- Duration: {video_info['duration']:.1f} seconds
+- Original FPS: {video_info['original_fps']:.1f}
+- Total frames: {video_info['total_frames']}
+- Analyzed frames: {video_info['extracted_frames']}
+- Resolution: {video_info['resolution']}
+Question: {question}
+Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames."""
+        progress(0.5, desc="Processing with VideoLLaMA3...")
+        # Prepare conversation format
+        conversation = f"System: {system_prompt}\n\nHuman: {user_prompt}\n\nAssistant:"
+        # Tokenize input
+        inputs = tokenizer(
+            conversation,
+            return_tensors="pt",
+            max_length=2048,
+            truncation=True,
+            padding=True
+        ).to(device)
+        progress(0.7, desc="Generating AI response...")
+        # Generate response
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                temperature=0.7,
+                do_sample=True,
+                top_p=0.9,
+                repetition_penalty=1.1,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
+            )
+        # Decode response
+        full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Extract just the assistant's response
+        if "Assistant:" in full_response:
+            ai_response = full_response.split("Assistant:")[-1].strip()
+        else:
+            ai_response = full_response.split(conversation)[-1].strip()
+        progress(0.9, desc="Formatting results...")
+        # Format the final response
+        formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
+❓ **Your Question:**
+{question}
+🤖 **AI Analysis:**
+{ai_response}
+📊 **Video Information:**
+• Duration: {video_info['duration']:.1f} seconds
+• Frame Rate: {video_info['original_fps']:.1f} FPS
+• Total Frames: {video_info['total_frames']:,}
+• Analyzed Frames: {video_info['extracted_frames']}
+• Resolution: {video_info['resolution']}
+⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
+"""
+        progress(1.0, desc="Analysis complete!")
+        return formatted_response
+    except torch.cuda.OutOfMemoryError:
+        torch.cuda.empty_cache()
+        return "❌ GPU memory error. Please try with a shorter video or restart the space."
     except Exception as e:
+        error_msg = f"❌ Error during video analysis: {str(e)}"
+        print(error_msg)
+        return error_msg
 def create_interface():
     """Create the Gradio interface"""
+    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎥 VideoLLaMA3 AI Video Analysis Tool")
+        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
+        # Model loading section
+        with gr.Row():
+            with gr.Column(scale=3):
+                model_status = gr.Textbox(
+                    label="🤖 Model Status",
+                    value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
+                    interactive=False,
+                    lines=2
+                )
+            with gr.Column(scale=1):
+                load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+        load_btn.click(load_videollama3_model, outputs=model_status)
+        gr.Markdown("---")
+        # Main interface
         with gr.Row():
             with gr.Column(scale=1):
                 video_input = gr.Video(
+                    label="📹 Upload Video (MP4, AVI, MOV, WebM)",
+                    height=350
                 )
                 question_input = gr.Textbox(
+                    label="❓ Ask about the video",
+                    placeholder="What is happening in this video? Describe it in detail.",
+                    lines=3,
+                    max_lines=5
                 )
+                analyze_btn = gr.Button("🔍 Analyze Video with AI", variant="primary", size="lg")
             with gr.Column(scale=1):
                 output = gr.Textbox(
+                    label="🎯 AI Analysis Results",
+                    lines=25,
+                    max_lines=30,
+                    show_copy_button=True
                 )
+        # Example questions
+        gr.Markdown("### 💡 Example Questions (click to use):")
+        example_questions = [
+            "What is happening in this video? Describe the scene in detail.",
+            "Who are the people in this video and what are they doing?",
+            "Describe the setting, location, and environment shown.",
+            "What objects, animals, or items can you see in the video?",
+            "What is the mood, atmosphere, or emotion conveyed?",
+            "Summarize the key events that occur chronologically."
+        ]
         with gr.Row():
+            for i in range(0, len(example_questions), 2):
+                with gr.Column():
+                    if i < len(example_questions):
+                        btn1 = gr.Button(example_questions[i], size="sm")
+                        btn1.click(lambda x=example_questions[i]: x, outputs=question_input)
+                    if i+1 < len(example_questions):
+                        btn2 = gr.Button(example_questions[i+1], size="sm")
+                        btn2.click(lambda x=example_questions[i+1]: x, outputs=question_input)
+        # Connect the analyze button
         analyze_btn.click(
             analyze_video_with_ai,
             inputs=[video_input, question_input],
         )
         gr.Markdown("---")
+        gr.Markdown("""
+        ### 📋 Instructions:
+        1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
+        2. **Then:** Upload your video file (keep it under 2 minutes for best results)
+        3. **Ask:** Type your question about the video content
+        4. **Analyze:** Click "Analyze Video with AI" to get detailed insights
+        💡 **Tips:**
+        - Shorter videos (30s-2min) work best
+        - Ask specific questions for better results
+        - Try different question styles to explore the AI's capabilities
+        """)
     return demo