Create app.py
app.py
ADDED
import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")

# Global variables
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model_loaded = False

def load_videollama_model():
    """Load VideoLLaMA model with proper error handling"""
    global model, tokenizer, model_loaded

    try:
        print("🚀 Loading VideoLLaMA model...")

        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"

        # Configure quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )

        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        model_loaded = True
        print("✅ VideoLLaMA model loaded successfully!")
        return "✅ Model loaded successfully!"

    except Exception as e:
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("🔄 Falling back to basic video analysis...")
        return error_msg

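# A minimal sketch of how this loader might be wired up (illustrative only;
# in a Gradio Space it would typically be called once at startup):
#
#     status = load_videollama_model()
#     print(status)  # "✅ Model loaded successfully!" or an error message
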
def extract_frames(video_path, max_frames=8):
    """Extract evenly spaced frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        if total_frames == 0:
            cap.release()
            print("No frames found in video")
            # Return the same (frames, info, timestamps) shape as the success
            # path, so callers can always unpack three values
            return [], {}, []

        # Get evenly spaced frame indices
        frame_indices = np.linspace(0, total_frames - 1, min(max_frames, total_frames), dtype=int)
        frames = []
        timestamps = []

        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize for efficiency while maintaining aspect ratio
                if width > 512 or height > 512:
                    scale = min(512 / width, 512 / height)
                    new_width = int(width * scale)
                    new_height = int(height * scale)
                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))

                frames.append(Image.fromarray(frame_rgb))
                timestamp = frame_idx / fps if fps > 0 else frame_idx
                timestamps.append(timestamp)

        cap.release()

        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames)
        }

        return frames, video_info, timestamps

    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []

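# A minimal usage sketch for extract_frames (illustrative only; "sample.mp4"
# is a hypothetical local file, not something this app ships with):
#
#     frames, info, timestamps = extract_frames("sample.mp4", max_frames=8)
#     if frames:
#         print(f"{info['extracted_frames']} frames over {info['duration']:.1f}s "
#               f"at {info['resolution']}; first frame at t={timestamps[0]:.2f}s")
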
def generate_basic_analysis(video_info, question, frames):
    """Generate basic video analysis when model is not available"""

    analysis_parts = []

    # Video technical info
    analysis_parts.append("📹 **Video Information:**")
    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
    analysis_parts.append(f"- Analyzed frames: {len(frames)}")

    # Basic visual analysis
    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")

    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)

        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
        analysis_parts.append("- Dominant colors: Analyzing RGB distribution...")

        # Simple color analysis
        r_avg = np.mean(first_frame[:, :, 0])
        g_avg = np.mean(first_frame[:, :, 1])
        b_avg = np.mean(first_frame[:, :, 2])

        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
                         "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
                         "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
                         "Balanced colors"
        analysis_parts.append(f"- Color tone: {dominant_color}")

    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\n🤖 **Analysis Response:**")

    # Generate contextual response based on question keywords
    question_lower = question.lower()

    if any(word in question_lower for word in ['what', 'describe', 'see']):
        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")

    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")

    if any(word in question_lower for word in ['people', 'person', 'human']):
        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")

    if any(word in question_lower for word in ['object', 'thing', 'item']):
        analysis_parts.append("Object detection and identification would require deeper model analysis. ")

    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")

    return "\n".join(analysis_parts)

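# A minimal sketch of the fallback path (illustrative only; assumes `frames`
# and `info` came from extract_frames above):
#
#     report = generate_basic_analysis(info, "What happens in this video?", frames)
#     print(report)
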
@spaces.GPU
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Main video analysis function"""

    if video_file is None:
        return "❌ Please upload a video file first."

    if not question.strip():
        return "❌ Please enter a question about the video."

    try:
        progress(0.1, desc="Processing video...")

        # Extract frames
        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)

        if not frames:
            return "❌ Could not extract frames from the video. Please check the video format."

        progress(0.5, desc="Analyzing content...")

        if model_loaded and model is not None and tokenizer is not None:
            # Try to use the actual model
            try:
                progress(0.7, desc="Running AI analysis...")

                # Prepare prompt for VideoLLaMA
                prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}

Please analyze this video and provide a detailed response.