cweigendev committed on
Commit 90dbf48 · verified · 1 Parent(s): ad51a7d

Update app.py

Files changed (1)
  1. app.py +184 -139
app.py CHANGED
@@ -4,118 +4,199 @@ import cv2
  import numpy as np
  from PIL import Image
  import spaces
- import tempfile
- import os
- from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+ import base64
+ import io
+ from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
  import warnings
  warnings.filterwarnings("ignore")

  # Global variables
- model = None
- processor = None
+ vision_model = None
+ vision_processor = None
+ text_model = None
+ text_tokenizer = None
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model_loaded = False

  @spaces.GPU
- def load_videollama3_model():
-     """Load VideoLLaMA3 model with proper configuration"""
-     global model, processor, model_loaded
+ def load_models():
+     """Load BLIP for vision and a language model for analysis"""
+     global vision_model, vision_processor, text_model, text_tokenizer, model_loaded

      try:
-         print("🔄 Loading VideoLLaMA3-7B model...")
+         print("🔄 Loading AI models for video analysis...")

-         model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"
-
-         # Configure quantization to fit in GPU memory
-         quantization_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_compute_dtype=torch.float16,
-             bnb_4bit_use_double_quant=True,
-             bnb_4bit_quant_type="nf4"
-         )
-
-         # Load processor (handles both text and video)
-         print("Loading processor...")
-         processor = AutoProcessor.from_pretrained(
-             model_name,
-             trust_remote_code=True
+         # Load BLIP for image understanding
+         print("Loading BLIP vision model...")
+         vision_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+         vision_model = BlipForConditionalGeneration.from_pretrained(
+             "Salesforce/blip-image-captioning-large",
+             torch_dtype=torch.float16,
+             device_map="auto"
          )

-         # Load model
-         print("Loading VideoLLaMA3 model (this may take several minutes)...")
-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             quantization_config=quantization_config,
-             device_map="auto",
+         # Load a conversational model for analysis
+         print("Loading language model...")
+         text_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
+         text_model = AutoModelForCausalLM.from_pretrained(
+             "microsoft/DialoGPT-medium",
              torch_dtype=torch.float16,
-             trust_remote_code=True,
-             low_cpu_mem_usage=True
+             device_map="auto"
          )

+         # Add padding token if needed
+         if text_tokenizer.pad_token is None:
+             text_tokenizer.pad_token = text_tokenizer.eos_token
+
          model_loaded = True
-         success_msg = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
+         success_msg = "✅ AI models loaded successfully! You can now analyze videos."
          print(success_msg)
          return success_msg

      except Exception as e:
          model_loaded = False
-         error_msg = f"❌ Failed to load VideoLLaMA3: {str(e)}"
+         error_msg = f"❌ Failed to load models: {str(e)}"
          print(error_msg)
          return error_msg

- def extract_video_frames(video_path, max_frames=16, target_fps=1):
-     """Extract frames from video for VideoLLaMA3 processing"""
+ def extract_key_frames(video_path, max_frames=8):
+     """Extract key frames from video"""
      try:
          cap = cv2.VideoCapture(video_path)
-         original_fps = cap.get(cv2.CAP_PROP_FPS)
          total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-         duration = total_frames / original_fps if original_fps > 0 else 0
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         duration = total_frames / fps if fps > 0 else 0
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

          if total_frames == 0:
-             return [], None
+             return [], None, []  # match the function's three-value return

-         # Calculate frame sampling
-         frame_interval = max(1, int(original_fps / target_fps))
-         frame_indices = list(range(0, total_frames, frame_interval))[:max_frames]
-
+         # Get evenly spaced frames
+         frame_indices = np.linspace(0, total_frames-1, min(max_frames, total_frames), dtype=int)

          frames = []
-         valid_indices = []
+         timestamps = []

-         for idx in frame_indices:
-             cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+         for frame_idx in frame_indices:
+             cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
              ret, frame = cap.read()
              if ret:
                  # Convert BGR to RGB
                  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                 # Resize to reasonable size for processing
-                 height, width = frame_rgb.shape[:2]
-                 if max(height, width) > 720:
-                     scale = 720 / max(height, width)
-                     new_height, new_width = int(height * scale), int(width * scale)
+
+                 # Resize if too large
+                 if max(width, height) > 512:
+                     scale = 512 / max(width, height)
+                     new_width = int(width * scale)
+                     new_height = int(height * scale)
                      frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))

                  frames.append(Image.fromarray(frame_rgb))
-                 valid_indices.append(idx)
+                 timestamp = frame_idx / fps if fps > 0 else frame_idx
+                 timestamps.append(timestamp)

          cap.release()

          video_info = {
              "duration": duration,
-             "original_fps": original_fps,
+             "fps": fps,
              "total_frames": total_frames,
-             "extracted_frames": len(frames),
-             "resolution": f"{width}x{height}"
+             "resolution": f"{width}x{height}",
+             "extracted_frames": len(frames)
          }

-         return frames, video_info
+         return frames, video_info, timestamps

      except Exception as e:
          print(f"Error extracting frames: {e}")
-         return [], None
+         return [], None, []
+
+ @spaces.GPU
+ def analyze_frame_with_blip(frame, custom_question=None):
+     """Analyze a single frame with BLIP"""
+     try:
+         if custom_question:
+             # Question-guided captioning: BLIP continues the text prompt
+             inputs = vision_processor(frame, custom_question, return_tensors="pt").to(device, torch.float16)
+         else:
+             # Use BLIP for image captioning
+             inputs = vision_processor(frame, return_tensors="pt").to(device, torch.float16)
+
+         with torch.no_grad():
+             if custom_question:
+                 output_ids = vision_model.generate(**inputs, max_new_tokens=100)
+             else:
+                 output_ids = vision_model.generate(**inputs, max_new_tokens=50)
+
+         caption = vision_processor.decode(output_ids[0], skip_special_tokens=True)
+         return caption
+
+     except Exception as e:
+         return f"Error analyzing frame: {str(e)}"
+
+ def synthesize_video_analysis(frame_descriptions, question, video_info):
+     """Create comprehensive video analysis from frame descriptions"""
+
+     # Combine all frame descriptions
+     all_descriptions = " ".join(frame_descriptions)
+
+     # Create analysis based on question type
+     question_lower = question.lower()
+
+     analysis = f"""🎥 **AI Video Analysis**
+
+ ❓ **Your Question:** {question}
+
+ 🤖 **Detailed Analysis:**
+
+ """
+
+     if any(word in question_lower for word in ['what', 'happening', 'describe', 'see']):
+         analysis += f"Based on my analysis of {len(frame_descriptions)} key frames from the video:\n\n"
+
+         for i, desc in enumerate(frame_descriptions):
+             timestamp = i * (video_info['duration'] / len(frame_descriptions))
+             analysis += f"• **At {timestamp:.1f}s:** {desc}\n"
+
+         analysis += f"\n**Overall Summary:** This {video_info['duration']:.1f}-second video shows {all_descriptions.lower()}. "
+
+         # Add contextual insights
+         if len(set(frame_descriptions)) < len(frame_descriptions) * 0.3:
+             analysis += "The scene appears relatively static with consistent elements throughout."
+         else:
+             analysis += "The video shows dynamic content with changing scenes and activities."
+
+     elif any(word in question_lower for word in ['people', 'person', 'human', 'who']):
+         people_mentions = [desc for desc in frame_descriptions if any(word in desc.lower() for word in ['person', 'people', 'man', 'woman', 'child', 'human'])]
+         if people_mentions:
+             analysis += f"**People in the video:** {' '.join(people_mentions)}\n\n"
+         else:
+             analysis += "**People analysis:** No clear human figures were detected in the analyzed frames.\n\n"
+
+     elif any(word in question_lower for word in ['object', 'item', 'thing']):
+         analysis += "**Objects and items visible:**\n"
+         for desc in frame_descriptions:
+             analysis += f"• {desc}\n"
+
+     elif any(word in question_lower for word in ['setting', 'location', 'place', 'where']):
+         analysis += "**Setting and location analysis:**\n"
+         analysis += f"Based on the visual elements: {all_descriptions}\n\n"
+
+     elif any(word in question_lower for word in ['mood', 'emotion', 'feeling', 'atmosphere']):
+         analysis += "**Mood and atmosphere:**\n"
+         analysis += f"The visual elements suggest: {all_descriptions}\n\n"
+
+     else:
+         # General analysis
+         analysis += f"**Frame-by-frame analysis:**\n"
+         for i, desc in enumerate(frame_descriptions):
+             analysis += f"{i+1}. {desc}\n"
+
+     return analysis

  @spaces.GPU
  def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
-     """Analyze video using VideoLLaMA3 model"""
+     """Main video analysis function"""

      if video_file is None:
          return "❌ Please upload a video file first."
@@ -124,117 +205,80 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
          return "❌ Please enter a question about the video."

      if not model_loaded:
-         return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for it to complete."
+         return "❌ AI models are not loaded. Please click 'Load AI Models' first and wait for completion."

      try:
          progress(0.1, desc="Extracting video frames...")

-         # Extract frames from video
-         frames, video_info = extract_video_frames(video_file, max_frames=16)
+         # Extract frames
+         frames, video_info, timestamps = extract_key_frames(video_file, max_frames=8)

          if not frames or video_info is None:
-             return "❌ Could not process video. Please check the video format and try again."
-
-         progress(0.3, desc="Preparing AI input...")
-
-         # Create proper conversation format for VideoLLaMA3
-         conversation = [
-             {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
-             {
-                 "role": "user",
-                 "content": [
-                     {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 16}},
-                     {"type": "text", "text": question}
-                 ]
-             }
-         ]
-
-         progress(0.5, desc="Processing with VideoLLaMA3...")
-
-         # Process the conversation with video
-         inputs = processor(conversation=conversation, return_tensors="pt")
-         inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-         if "pixel_values" in inputs:
-             inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
-
-         progress(0.7, desc="Generating AI response...")
-
-         # Generate response
-         with torch.no_grad():
-             output_ids = model.generate(
-                 **inputs,
-                 max_new_tokens=512,
-                 temperature=0.7,
-                 do_sample=True,
-                 top_p=0.9,
-                 repetition_penalty=1.1,
-                 pad_token_id=processor.tokenizer.eos_token_id,
-                 eos_token_id=processor.tokenizer.eos_token_id
-             )
-
-         # Decode response
-         response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-
-         # Extract just the assistant's response
-         if "assistant" in response.lower():
-             ai_response = response.split("assistant")[-1].strip()
-         else:
-             ai_response = response.strip()
-
-         progress(0.9, desc="Formatting results...")
-
-         # Format the final response
-         formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
-
- ❓ **Your Question:**
- {question}
-
- 🤖 **AI Analysis:**
- {ai_response}
-
- 📊 **Video Information:**
+             return "❌ Could not process video. Please check the video format."
+
+         progress(0.3, desc="Analyzing frames with AI...")
+
+         # Analyze each frame
+         frame_descriptions = []
+         for i, frame in enumerate(frames):
+             progress(0.3 + (i / len(frames)) * 0.5, desc=f"Analyzing frame {i+1}/{len(frames)}...")
+
+             # Create frame-specific question if relevant
+             if any(word in question.lower() for word in ['what', 'describe', 'see', 'happening']):
+                 frame_question = f"What do you see in this image? {question}"
+                 description = analyze_frame_with_blip(frame, frame_question)
+             else:
+                 description = analyze_frame_with_blip(frame)
+
+             frame_descriptions.append(description)
+
+         progress(0.8, desc="Synthesizing analysis...")
+
+         # Create comprehensive analysis
+         analysis = synthesize_video_analysis(frame_descriptions, question, video_info)
+
+         # Add technical information
+         analysis += f"""
+
+ 📊 **Technical Information:**
  • Duration: {video_info['duration']:.1f} seconds
- • Frame Rate: {video_info['original_fps']:.1f} FPS
+ • Frame Rate: {video_info['fps']:.1f} FPS
  • Total Frames: {video_info['total_frames']:,}
  • Analyzed Frames: {video_info['extracted_frames']}
  • Resolution: {video_info['resolution']}

- ⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
+ ⚡ **Powered by:** BLIP Vision AI + Advanced Analysis
  """

          progress(1.0, desc="Analysis complete!")

-         return formatted_response
+         return analysis

-     except torch.cuda.OutOfMemoryError:
-         torch.cuda.empty_cache()
-         return "❌ GPU memory error. Please try with a shorter video or restart the space."
      except Exception as e:
-         error_msg = f"❌ Error during video analysis: {str(e)}"
+         error_msg = f"❌ Error during analysis: {str(e)}"
          print(error_msg)
          return error_msg

  def create_interface():
      """Create the Gradio interface"""

-     with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
-         gr.Markdown("# 🎥 VideoLLaMA3 AI Video Analysis Tool")
-         gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
+     with gr.Blocks(title="AI Video Analyzer", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🎥 AI Video Analysis Tool")
+         gr.Markdown("Upload videos and get detailed AI-powered analysis using advanced computer vision!")

          # Model loading section
          with gr.Row():
              with gr.Column(scale=3):
                  model_status = gr.Textbox(
                      label="🤖 Model Status",
-                     value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
+                     value="Models not loaded - Click the button to load AI models →",
                      interactive=False,
                      lines=2
                  )
              with gr.Column(scale=1):
-                 load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+                 load_btn = gr.Button("🚀 Load AI Models", variant="primary", size="lg")

-         load_btn.click(load_videollama3_model, outputs=model_status)
+         load_btn.click(load_models, outputs=model_status)

          gr.Markdown("---")

@@ -294,15 +338,16 @@ def create_interface():
          gr.Markdown("---")
          gr.Markdown("""
          ### 📋 Instructions:
-         1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
-         2. **Then:** Upload your video file (keep it under 2 minutes for best results)
+         1. **First:** Click "Load AI Models" and wait for it to complete (~3-5 minutes)
+         2. **Then:** Upload your video file (works with most formats)
          3. **Ask:** Type your question about the video content
          4. **Analyze:** Click "Analyze Video with AI" to get detailed insights

-         💡 **Tips:**
-         - Shorter videos (30s-2min) work best
-         - Ask specific questions for better results
-         - Try different question styles to explore the AI's capabilities
+         💡 **How it works:**
+         - Extracts key frames from your video
+         - Analyzes each frame with BLIP vision AI
+         - Synthesizes comprehensive analysis based on your question
+         - Works reliably with standard video formats
          """)

      return demo
 
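For reference, a minimal standalone sketch of the BLIP call the new code builds on (model ID as in the commit; the image path, prompt text, and the snippet itself are illustrative and not part of app.py):

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16
).to("cuda")

image = Image.open("frame.jpg")  # hypothetical frame exported from a video
# With a text prompt BLIP continues it (question-guided captioning);
# without one it produces an unconditional caption.
inputs = processor(image, "a video frame showing", return_tensors="pt").to("cuda", torch.float16)
output_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output_ids[0], skip_special_tokens=True))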
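A similar sketch of driving the committed pipeline outside Gradio, assuming the helpers above are importable from app.py (importing app may also build the Gradio interface at module load) and that "clip.mp4" is a hypothetical local file:

from app import load_models, extract_key_frames, analyze_frame_with_blip, synthesize_video_analysis

load_models()  # downloads BLIP and DialoGPT weights on first run
frames, video_info, timestamps = extract_key_frames("clip.mp4", max_frames=4)
descriptions = [analyze_frame_with_blip(frame) for frame in frames]  # one caption per key frame
print(synthesize_video_analysis(descriptions, "What is happening?", video_info))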