import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")

# Global variables
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model_loaded = False

def load_videollama_model():
    """Load VideoLLaMA model with proper error handling"""
    global model, tokenizer, model_loaded
    
    try:
        print("πŸ”„ Loading VideoLLaMA model...")
        
        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"
        
        # Configure quantization for memory efficiency
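        # NF4 with double quantization stores weights in ~4 bits instead of 16, so a
        # 7B-class checkpoint fits in roughly 4-5 GB of GPU memory rather than ~14 GB in fp16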
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
        
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )
        
        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
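        # Many causal-LM tokenizers ship without a pad token; reusing EOS avoids
        # padding errors if generation ever batches inputs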
        
        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        
        model_loaded = True
        print("βœ… VideoLLaMA model loaded successfully!")
        return "βœ… Model loaded successfully!"
        
    except Exception as e:
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("πŸ”„ Falling back to basic video analysis...")
        return error_msg

def extract_frames(video_path, max_frames=8):
    """Extract evenly spaced frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        
        if total_frames == 0:
            cap.release()
            return [], {}, []
        
        # Get evenly spaced frame indices
        frame_indices = np.linspace(0, total_frames-1, min(max_frames, total_frames), dtype=int)
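        # e.g. a 240-frame clip with max_frames=8 yields indices [0, 34, 68, ..., 239]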
        frames = []
        timestamps = []
        
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize for efficiency while maintaining aspect ratio
                if width > 512 or height > 512:
                    scale = min(512/width, 512/height)
                    new_width = int(width * scale)
                    new_height = int(height * scale)
                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                
                frames.append(Image.fromarray(frame_rgb))
                timestamp = frame_idx / fps if fps > 0 else frame_idx
                timestamps.append(timestamp)
        
        cap.release()
        
        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames)
        }
        
        return frames, video_info, timestamps
        
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []
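
# Hypothetical smoke test (path and values are illustrative, not part of the app):
#   frames, info, timestamps = extract_frames("sample.mp4", max_frames=4)
#   print(info.get("resolution"), len(frames), timestamps)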

def generate_basic_analysis(video_info, question, frames):
    """Generate basic video analysis when model is not available"""
    
    analysis_parts = []
    
    # Video technical info
    analysis_parts.append("πŸ“Ή **Video Information:**")
    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
    analysis_parts.append(f"- Analyzed frames: {len(frames)}")
    
    # Basic visual analysis
    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")
    
    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)
        
        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
        analysis_parts.append(f"- Dominant colors: Analyzing RGB distribution...")
        
        # Simple color analysis
        r_avg = np.mean(first_frame[:,:,0])
        g_avg = np.mean(first_frame[:,:,1])
        b_avg = np.mean(first_frame[:,:,2])
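        # A channel counts as a tint only if its mean exceeds both others by 20 on the 0-255 scale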
        
        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
                        "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
                        "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
                        "Balanced colors"
        analysis_parts.append(f"- Color tone: {dominant_color}")
    
    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\nπŸ€– **Analysis Response:**")
    
    # Generate contextual response based on question keywords
    question_lower = question.lower()
    
    if any(word in question_lower for word in ['what', 'describe', 'see']):
        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")
    
    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")
    
    if any(word in question_lower for word in ['people', 'person', 'human']):
        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")
    
    if any(word in question_lower for word in ['object', 'thing', 'item']):
        analysis_parts.append("Object detection and identification would require deeper model analysis. ")
    
    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
    
    return "\n".join(analysis_parts)

@spaces.GPU
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Main video analysis function"""
    
    if video_file is None:
        return "❌ Please upload a video file first."
    
    if not question.strip():
        return "❌ Please enter a question about the video."
    
    try:
        progress(0.1, desc="Processing video...")
        
        # Extract frames
        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
        
        if not frames:
            return "❌ Could not extract frames from the video. Please check the video format."
        
        progress(0.5, desc="Analyzing content...")
        
        if model_loaded and model is not None and tokenizer is not None:
            # Try to use the actual model
            try:
                progress(0.7, desc="Running AI analysis...")
                
                # Prepare prompt for VideoLLaMA
                prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}

Please analyze this video and provide a detailed response.