Spaces:

cweigendev
/

videoanalyzer

Paused

App Files Files Community

cweigendev committed on Aug 6

Commit

4726b4a

verified ·

1 Parent(s): 90dbf48

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -221

app.py CHANGED Viewed

@@ -4,199 +4,73 @@ import cv2
 import numpy as np
 from PIL import Image
 import spaces
-import base64
-import io
-from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import warnings
 warnings.filterwarnings("ignore")
 # Global variables
-vision_model = None
-vision_processor = None
-text_model = None
-text_tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_loaded = False
 @spaces.GPU
def load_models():
    """Load the BLIP vision model and the DialoGPT language model into module globals.

    Sets ``vision_processor``, ``vision_model``, ``text_tokenizer``,
    ``text_model`` and the ``model_loaded`` flag.

    Returns:
        A status string: the success message when both models load, otherwise
        a failure message that embeds the exception text.
    """
    global vision_model, vision_processor, text_model, text_tokenizer, model_loaded

    blip_checkpoint = "Salesforce/blip-image-captioning-large"
    chat_checkpoint = "microsoft/DialoGPT-medium"
    # Both models are loaded with the same precision and placement settings.
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}

    try:
        print("🔄 Loading AI models for video analysis...")

        print("Loading BLIP vision model...")
        vision_processor = BlipProcessor.from_pretrained(blip_checkpoint)
        vision_model = BlipForConditionalGeneration.from_pretrained(
            blip_checkpoint, **load_options
        )

        print("Loading language model...")
        text_tokenizer = AutoTokenizer.from_pretrained(chat_checkpoint)
        text_model = AutoModelForCausalLM.from_pretrained(
            chat_checkpoint, **load_options
        )

        # Reuse the EOS token for padding when the tokenizer defines none.
        if text_tokenizer.pad_token is None:
            text_tokenizer.pad_token = text_tokenizer.eos_token

        model_loaded = True
        status = "✅ AI models loaded successfully! You can now analyze videos."
        print(status)
        return status
    except Exception as exc:
        model_loaded = False
        status = f"❌ Failed to load models: {str(exc)}"
        print(status)
        return status
def extract_key_frames(video_path, max_frames=8):
    """Extract up to ``max_frames`` evenly spaced RGB frames from a video.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of frames to sample.

    Returns:
        A 3-tuple ``(frames, video_info, timestamps)``:
        ``frames`` is a list of PIL images (downscaled so the longest side is
        at most 512 px), ``video_info`` is a dict with ``duration``, ``fps``,
        ``total_frames``, ``resolution`` and ``extracted_frames``, and
        ``timestamps`` holds one entry per extracted frame (seconds when the
        FPS is known, otherwise the raw frame index).
        On any failure, or when the video reports no frames, the result is
        ``([], None, [])`` so callers can always unpack three values.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Previously this path returned only two values, which broke the
        # caller's three-way unpacking.
        if total_frames <= 0:
            return [], None, []

        # Evenly spaced frame indices across the whole clip.
        frame_indices = np.linspace(
            0, total_frames - 1, min(max_frames, total_frames), dtype=int
        )

        # The resize target depends only on the video metadata, so compute it once.
        target_size = None
        longest_side = max(width, height)
        if longest_side > 512:
            scale = 512 / longest_side
            target_size = (int(width * scale), int(height * scale))

        frames = []
        timestamps = []
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if not ret:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if target_size is not None:
                frame_rgb = cv2.resize(frame_rgb, target_size)
            frames.append(Image.fromarray(frame_rgb))
            timestamps.append(frame_idx / fps if fps > 0 else frame_idx)

        video_info = {
            "duration": duration,
            "fps": fps,
            "total_frames": total_frames,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames),
        }
        return frames, video_info, timestamps
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], None, []
    finally:
        # Release the capture on every exit path, including early returns.
        if cap is not None:
            cap.release()
 @spaces.GPU
def analyze_frame_with_blip(frame, custom_question=None):
    """Run the BLIP processor/model on one frame and return the decoded text.

    Args:
        frame: Image passed to ``vision_processor``.
        custom_question: Optional text passed to the processor alongside the
            frame; when given, generation is allowed up to 100 new tokens
            instead of 50.

    Returns:
        The decoded model output, or an ``"Error analyzing frame: ..."``
        string if anything raises.
    """
    try:
        if custom_question:
            # Question-conditioned call: the text goes in with the image.
            processor_args = (frame, custom_question)
            token_budget = 100
        else:
            # Plain captioning call: image only.
            processor_args = (frame,)
            token_budget = 50

        inputs = vision_processor(*processor_args, return_tensors="pt").to(device)
        with torch.no_grad():
            output_ids = vision_model.generate(**inputs, max_new_tokens=token_budget)
        return vision_processor.decode(output_ids[0], skip_special_tokens=True)
    except Exception as exc:
        return f"Error analyzing frame: {str(exc)}"
def synthesize_video_analysis(frame_descriptions, question, video_info):
    """Build a Markdown report from per-frame descriptions, shaped by the question.

    The question is matched (case-insensitively, by substring) against keyword
    groups to choose one report layout: general description, people, objects,
    setting, mood, or a numbered frame-by-frame list as the fallback.

    Args:
        frame_descriptions: Sequence of description strings, one per frame.
        question: The user's question; ``None`` is treated as an empty string.
        video_info: Dict that must provide ``"duration"`` (seconds); it is only
            read for the general-description layout.

    Returns:
        The report as a single string.
    """
    descriptions = list(frame_descriptions or [])
    question = question or ""
    question_lower = question.lower()
    all_descriptions = " ".join(descriptions)

    def asks_about(*keywords):
        """Return True if any keyword occurs as a substring of the question."""
        return any(word in question_lower for word in keywords)

    # Collect pieces and join once instead of repeated string concatenation.
    parts = [
        "🎥 **AI Video Analysis**\n"
        f"❓ **Your Question:** {question}\n"
        "🤖 **Detailed Analysis:**\n"
    ]

    # With nothing to describe, every layout below would produce an empty or
    # misleading section, so say so explicitly.
    if not descriptions:
        parts.append("No frame descriptions were available to analyze.\n")
        return "".join(parts)

    if asks_about('what', 'happening', 'describe', 'see'):
        duration = video_info['duration']
        parts.append(
            f"Based on my analysis of {len(descriptions)} key frames from the video:\n\n"
        )
        for i, desc in enumerate(descriptions):
            # Approximate time: frames are assumed evenly spread over the clip.
            timestamp = i * (duration / len(descriptions))
            parts.append(f"• **At {timestamp:.1f}s:** {desc}\n")
        parts.append(
            f"\n**Overall Summary:** This {duration:.1f}-second video shows {all_descriptions.lower()}. "
        )
        # Few distinct descriptions relative to the frame count => static scene.
        if len(set(descriptions)) < len(descriptions) * 0.3:
            parts.append("The scene appears relatively static with consistent elements throughout.")
        else:
            parts.append("The video shows dynamic content with changing scenes and activities.")
    elif asks_about('people', 'person', 'human', 'who'):
        people_words = ('person', 'people', 'man', 'woman', 'child', 'human')
        people_mentions = [
            desc for desc in descriptions
            if any(word in desc.lower() for word in people_words)
        ]
        if people_mentions:
            parts.append(f"**People in the video:** {' '.join(people_mentions)}\n\n")
        else:
            parts.append("**People analysis:** No clear human figures were detected in the analyzed frames.\n\n")
    elif asks_about('object', 'item', 'thing'):
        parts.append("**Objects and items visible:**\n")
        parts.extend(f"• {desc}\n" for desc in descriptions)
    elif asks_about('setting', 'location', 'place', 'where'):
        parts.append("**Setting and location analysis:**\n")
        parts.append(f"Based on the visual elements: {all_descriptions}\n\n")
    elif asks_about('mood', 'emotion', 'feeling', 'atmosphere'):
        parts.append("**Mood and atmosphere:**\n")
        parts.append(f"The visual elements suggest: {all_descriptions}\n\n")
    else:
        # No keyword group matched: numbered list of every description.
        parts.append("**Frame-by-frame analysis:**\n")
        parts.extend(f"{i}. {desc}\n" for i, desc in enumerate(descriptions, start=1))

    return "".join(parts)
-@spaces.GPU
-def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
-    """Main video analysis function"""
     if video_file is None:
         return "❌ Please upload a video file first."
@@ -204,81 +78,145 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
     if not question.strip():
         return "❌ Please enter a question about the video."
-    if not model_loaded:
-        return "❌ AI models are not loaded. Please click 'Load AI Models' first and wait for completion."
     try:
-        progress(0.1, desc="Extracting video frames...")
-        # Extract frames
-        frames, video_info, timestamps = extract_key_frames(video_file, max_frames=8)
-        if not frames or video_info is None:
-            return "❌ Could not process video. Please check the video format."
-        progress(0.3, desc="Analyzing frames with AI...")
-        # Analyze each frame
-        frame_descriptions = []
-        for i, frame in enumerate(frames):
-            progress(0.3 + (i / len(frames)) * 0.5, desc=f"Analyzing frame {i+1}/{len(frames)}...")
-            # Create frame-specific question if relevant
-            if any(word in question.lower() for word in ['what', 'describe', 'see', 'happening']):
-                frame_question = f"What do you see in this image? {question}"
-                description = analyze_frame_with_blip(frame, frame_question)
             else:
-                description = analyze_frame_with_blip(frame)
-            frame_descriptions.append(description)
-        progress(0.8, desc="Synthesizing analysis...")
-        # Create comprehensive analysis
-        analysis = synthesize_video_analysis(frame_descriptions, question, video_info)
-        # Add technical information
-        analysis += f"""
-📊 **Technical Information:**
-• Duration: {video_info['duration']:.1f} seconds
-• Frame Rate: {video_info['fps']:.1f} FPS
-• Total Frames: {video_info['total_frames']:,}
-• Analyzed Frames: {video_info['extracted_frames']}
-• Resolution: {video_info['resolution']}
-⚡ **Powered by:** BLIP Vision AI + Advanced Analysis
 """
-        progress(1.0, desc="Analysis complete!")
-        return analysis
     except Exception as e:
-        error_msg = f"❌ Error during analysis: {str(e)}"
         print(error_msg)
-        return error_msg
 def create_interface():
     """Create the Gradio interface"""
-    with gr.Blocks(title="AI Video Analyzer", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🎥 AI Video Analysis Tool")
-        gr.Markdown("Upload videos and get detailed AI-powered analysis using advanced computer vision!")
         # Model loading section
         with gr.Row():
             with gr.Column(scale=3):
                 model_status = gr.Textbox(
                     label="🤖 Model Status",
-                    value="Models not loaded - Click the button to load AI models →",
                     interactive=False,
                     lines=2
                 )
             with gr.Column(scale=1):
-                load_btn = gr.Button("🚀 Load AI Models", variant="primary", size="lg")
-        load_btn.click(load_models, outputs=model_status)
         gr.Markdown("---")
@@ -295,7 +233,7 @@ def create_interface():
                     lines=3,
                     max_lines=5
                 )
-                analyze_btn = gr.Button("🔍 Analyze Video with AI", variant="primary", size="lg")
             with gr.Column(scale=1):
                 output = gr.Textbox(
@@ -329,7 +267,7 @@ def create_interface():
         # Connect the analyze button
         analyze_btn.click(
-            analyze_video_with_ai,
             inputs=[video_input, question_input],
             outputs=output,
             show_progress=True
@@ -338,16 +276,15 @@ def create_interface():
         gr.Markdown("---")
         gr.Markdown("""
         ### 📋 Instructions:
-        1. **First:** Click "Load AI Models" and wait for it to complete (~3-5 minutes)
-        2. **Then:** Upload your video file (works with most formats)
         3. **Ask:** Type your question about the video content
-        4. **Analyze:** Click "Analyze Video with AI" to get detailed insights
-        💡 **How it works:**
-        - Extracts key frames from your video
-        - Analyzes each frame with BLIP vision AI
-        - Synthesizes comprehensive analysis based on your question
-        - Works reliably with standard video formats
         """)
     return demo

 import numpy as np
 from PIL import Image
 import spaces
+import tempfile
+import os
+import subprocess
+import sys
+# Install dependencies if needed
def install_dependencies(packages=None):
    """Ensure pip packages are importable, installing any that are missing.

    Each package is probed by importing it under its pip name with ``-``
    replaced by ``_``; on ``ImportError`` it is installed with
    ``python -m pip install <package> --quiet`` using the running interpreter.

    Args:
        packages: Iterable of pip package names (a single string is accepted
            too). Defaults to the packages required for VideoLLaMA3:
            ``decord``, ``imageio`` and ``imageio-ffmpeg``.

    Returns:
        List of the package names that had to be installed (empty when
        everything was already importable).

    Raises:
        subprocess.CalledProcessError: If a pip invocation fails.
    """
    import importlib

    if packages is None:
        packages = ["decord", "imageio", "imageio-ffmpeg"]
    elif isinstance(packages, str):
        # A bare string would otherwise be iterated character by character.
        packages = [packages]

    installed = []
    for package in packages:
        module_name = package.replace("-", "_")
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package, "--quiet"]
            )
            # Let the import system see modules installed while running.
            importlib.invalidate_caches()
            installed.append(package)
    return installed
+# Install dependencies on startup
+install_dependencies()
+from transformers import AutoModelForCausalLM, AutoProcessor
 import warnings
 warnings.filterwarnings("ignore")
 # Global variables
+model = None
+processor = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_loaded = False
 @spaces.GPU
def load_videollama3_model():
    """Load the VideoLLaMA3-7B processor and model into module globals.

    Sets ``processor``, ``model`` and the ``model_loaded`` flag.

    Returns:
        A status string: the success message when loading completes,
        otherwise a failure message that embeds the exception text.
    """
    global model, processor, model_loaded

    checkpoint = "DAMO-NLP-SG/VideoLLaMA3-7B"

    try:
        print("🔄 Loading VideoLLaMA3-7B model...")

        print("Loading processor...")
        processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)

        print("Loading VideoLLaMA3 model (this may take several minutes)...")
        model_options = {
            "trust_remote_code": True,
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
        }
        model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_options)

        model_loaded = True
        status = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
        print(status)
        return status
    except Exception as exc:
        model_loaded = False
        status = f"❌ Failed to load VideoLLaMA3: {str(exc)}"
        print(status)
        return status
 @spaces.GPU
def analyze_video_with_videollama3(video_file, question, progress=gr.Progress()):
    """Answer a question about a video using the loaded VideoLLaMA3 model.

    Args:
        video_file: Path to the uploaded video, or ``None`` if nothing was uploaded.
        question: The user's question about the video.
        progress: Gradio progress callback, called with a fraction and a
            description at each stage.

    Returns:
        A Markdown-formatted string: the model's answer plus basic video
        metadata on success; a validation message when an input or the model
        is missing; or, if analysis raises, a fallback report with video
        metadata and the error text (or just the error message if even the
        metadata cannot be read).
    """
    if video_file is None:
        return "❌ Please upload a video file first."
    # Guard against None as well as blank text; `.strip()` on None would raise.
    if not question or not question.strip():
        return "❌ Please enter a question about the video."
    if not model_loaded or model is None or processor is None:
        return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for completion."

    try:
        progress(0.1, desc="Preparing video for analysis...")
        # Conversation layout passed to the processor: a system prompt, then a
        # user turn carrying the video reference (sampled at 1 fps, capped at
        # 64 frames) and the question text.
        conversation = [
            {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 64}},
                    {"type": "text", "text": question},
                ],
            },
        ]

        progress(0.3, desc="Processing video with VideoLLaMA3...")
        inputs = processor(conversation=conversation, return_tensors="pt")
        # Move tensors to the target device; leave non-tensor entries untouched.
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        if "pixel_values" in inputs:
            # Match the dtype the model was loaded with.
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

        progress(0.7, desc="Generating AI response...")
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        progress(0.9, desc="Processing response...")
        response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

        # Keep only the text after the last role marker, if one is present.
        # NOTE(review): this is a textual heuristic; an answer that itself
        # contains the word "assistant" will be truncated — confirm whether
        # generate() already returns only the new tokens.
        if "assistant" in response.lower():
            ai_response = response.split("assistant")[-1].strip()
        elif "user:" in response.lower():
            ai_response = response.split("user:")[-1].strip()
        else:
            ai_response = response.strip()
        ai_response = ai_response.replace("</s>", "").strip()

        fps, total_frames, duration, width, height = _probe_video_info(video_file)

        progress(1.0, desc="Analysis complete!")
        formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
❓ **Your Question:**
{question}
🤖 **AI Analysis:**
{ai_response}
📊 **Video Information:**
• Duration: {duration:.1f} seconds
• Frame Rate: {fps:.1f} FPS
• Total Frames: {total_frames:,}
• Resolution: {width}x{height}
⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
"""
        return formatted_response
    except Exception as e:
        error_msg = f"❌ Error during VideoLLaMA3 analysis: {str(e)}"
        print(error_msg)
        # Fallback: report basic video metadata even though analysis failed.
        try:
            fps, total_frames, duration, _, _ = _probe_video_info(video_file)
            fallback_response = f"""❌ VideoLLaMA3 analysis failed, but here's what I can tell you:
**Video Technical Info:**
• Duration: {duration:.1f} seconds
• Frame Rate: {fps:.1f} FPS
• Total Frames: {total_frames:,}
**Error:** {str(e)}
**Suggestion:** Try reloading the model or using a shorter video file.
"""
            return fallback_response
        except Exception as probe_error:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            print(f"Could not read video metadata: {probe_error}")
            return error_msg


def _probe_video_info(video_path):
    """Return ``(fps, total_frames, duration, width, height)`` read via OpenCV.

    ``duration`` is ``total_frames / fps`` in seconds, or 0 when OpenCV
    reports a non-positive FPS. The capture is always released.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()
    duration = total_frames / fps if fps > 0 else 0
    return fps, total_frames, duration, width, height
 def create_interface():
     """Create the Gradio interface"""
+    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎥 VideoLLaMA3 Video Analysis Tool")
+        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
         # Model loading section
         with gr.Row():
             with gr.Column(scale=3):
                 model_status = gr.Textbox(
                     label="🤖 Model Status",
+                    value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
                     interactive=False,
                     lines=2
                 )
             with gr.Column(scale=1):
+                load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+        load_btn.click(load_videollama3_model, outputs=model_status)
         gr.Markdown("---")
                     lines=3,
                     max_lines=5
                 )
+                analyze_btn = gr.Button("🔍 Analyze Video with VideoLLaMA3", variant="primary", size="lg")
             with gr.Column(scale=1):
                 output = gr.Textbox(
         # Connect the analyze button
         analyze_btn.click(
+            analyze_video_with_videollama3,
             inputs=[video_input, question_input],
             outputs=output,
             show_progress=True
         gr.Markdown("---")
         gr.Markdown("""
         ### 📋 Instructions:
+        1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
+        2. **Then:** Upload your video file (works best with videos under 2 minutes)
         3. **Ask:** Type your question about the video content
+        4. **Analyze:** Click "Analyze Video with VideoLLaMA3" to get detailed insights
+        💡 **Tips:**
+        - Keep videos under 2 minutes for best performance
+        - Ask specific, detailed questions for better results
+        - The model will analyze up to 64 frames from your video
         """)
     return demo