videoanalyzer / app.py
cweigendev's picture
Create app.py
366ac1b verified
raw
history blame
8.38 kB
import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")
# Global variables shared across Gradio callbacks.
# Populated lazily by load_videollama_model(); read by analyze_video_with_ai().
model = None  # transformers causal-LM instance once loaded, else None
tokenizer = None  # matching tokenizer once loaded, else None
device = "cuda" if torch.cuda.is_available() else "cpu"  # preferred inference device
model_loaded = False  # True only after a successful load_videollama_model() call
def load_videollama_model():
    """Load the VideoLLaMA model and tokenizer into module globals.

    Returns a human-readable status string. On any failure, ``model_loaded``
    is set to ``False`` and the app falls back to the basic (non-AI)
    frame analysis path.
    """
    global model, tokenizer, model_loaded

    try:
        print("πŸ”„ Loading VideoLLaMA model...")

        # NOTE(review): placeholder repo id — swap in the real VideoLLaMA3
        # checkpoint when one becomes available.
        model_name = "DAMO-NLP-SG/Video-LLaMA"

        # 4-bit NF4 quantization keeps the memory footprint small enough
        # for a shared GPU environment.
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False,
        )
        # Some checkpoints ship without a pad token; reuse EOS in that case.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quant_cfg,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        model_loaded = True
        print("βœ… VideoLLaMA model loaded successfully!")
        return "βœ… Model loaded successfully!"
    except Exception as e:
        # Boundary handler: log and report, then let the app keep running
        # with the non-AI fallback analysis.
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("πŸ”„ Falling back to basic video analysis...")
        return error_msg
def extract_frames(video_path, max_frames=8):
    """Extract up to ``max_frames`` evenly spaced RGB frames from a video.

    Parameters
    ----------
    video_path : str
        Path to a video file readable by OpenCV.
    max_frames : int
        Upper bound on the number of frames to sample.

    Returns
    -------
    tuple[list[PIL.Image.Image], dict, list[float]]
        ``(frames, video_info, timestamps)``. On any failure the result is
        ``([], {}, [])`` so callers can always unpack three values.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames / fps if fps > 0 else 0
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            if total_frames <= 0:
                # BUG FIX: the original returned a 2-tuple here
                # ([], "No frames found in video") while every caller unpacks
                # three values — that raised ValueError on empty videos.
                print("No frames found in video")
                return [], {}, []

            # Evenly spaced sample indices across the whole clip.
            frame_indices = np.linspace(
                0, total_frames - 1, min(max_frames, total_frames), dtype=int
            )

            frames = []
            timestamps = []
            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret, frame = cap.read()
                if not ret:
                    # Seek/decode failed for this index; skip it.
                    continue
                # OpenCV decodes BGR; downstream (PIL) expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Downscale large frames (preserving aspect ratio) to limit
                # downstream memory/compute.
                if width > 512 or height > 512:
                    scale = min(512 / width, 512 / height)
                    frame_rgb = cv2.resize(
                        frame_rgb, (int(width * scale), int(height * scale))
                    )
                frames.append(Image.fromarray(frame_rgb))
                timestamps.append(frame_idx / fps if fps > 0 else frame_idx)

            video_info = {
                "total_frames": total_frames,
                "fps": fps,
                "duration": duration,
                "resolution": f"{width}x{height}",
                "extracted_frames": len(frames),
            }
            return frames, video_info, timestamps
        finally:
            # BUG FIX: always release the capture handle — the original
            # leaked it on the early return and on exceptions.
            cap.release()
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []
def generate_basic_analysis(video_info, question, frames):
    """Build a heuristic, non-AI report from video metadata and the first frame.

    Used as a fallback when no model is loaded: emits the technical info,
    crude brightness/contrast/color-tone heuristics, and a keyword-driven
    canned response to the user's question. Returns one newline-joined string.
    """
    report = [
        "πŸ“Ή **Video Information:**",
        f"- Duration: {video_info.get('duration', 0):.1f} seconds",
        f"- Resolution: {video_info.get('resolution', 'Unknown')}",
        f"- Frame rate: {video_info.get('fps', 0):.1f} FPS",
        f"- Total frames: {video_info.get('total_frames', 0)}",
        f"- Analyzed frames: {len(frames)}",
        "\n🎨 **Basic Visual Analysis:**",
    ]

    if frames:
        # All visual heuristics are computed from the first sampled frame only.
        pixels = np.array(frames[0])
        brightness = np.mean(pixels)
        variance = np.var(pixels)
        report.append(f"- Average brightness: {'Bright' if brightness > 127 else 'Dark'}")
        report.append(f"- Color variance: {'High contrast' if variance > 1000 else 'Low contrast'}")
        report.append("- Dominant colors: Analyzing RGB distribution...")

        # A channel "wins" only when its mean exceeds both others by 20+ levels;
        # checks run in R, G, B priority order.
        r_avg = np.mean(pixels[:, :, 0])
        g_avg = np.mean(pixels[:, :, 1])
        b_avg = np.mean(pixels[:, :, 2])
        if r_avg > max(g_avg, b_avg) + 20:
            tone = "Red-tinted"
        elif g_avg > max(r_avg, b_avg) + 20:
            tone = "Green-tinted"
        elif b_avg > max(r_avg, g_avg) + 20:
            tone = "Blue-tinted"
        else:
            tone = "Balanced colors"
        report.append(f"- Color tone: {tone}")

    report.append(f"\n❓ **Your Question:** '{question}'")
    report.append("\nπŸ€– **Analysis Response:**")

    # Keyword-triggered canned sentences, in the same order as the checks
    # appear in the model-backed path.
    lowered = question.lower()
    canned = [
        (('what', 'describe', 'see'),
         "Based on the extracted frames, this video contains visual content that has been processed and analyzed. "),
        (('action', 'activity', 'doing', 'happening'),
         "The video appears to show some form of activity or movement across the analyzed timepoints. "),
        (('people', 'person', 'human'),
         "The analysis would need to examine the frames for human presence and activities. "),
        (('object', 'thing', 'item'),
         "Object detection and identification would require deeper model analysis. "),
    ]
    for keywords, sentence in canned:
        if any(word in lowered for word in keywords):
            report.append(sentence)

    report.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
    return "\n".join(report)
@spaces.GPU
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
"""Main video analysis function"""
if video_file is None:
return "❌ Please upload a video file first."
if not question.strip():
return "❌ Please enter a question about the video."
try:
progress(0.1, desc="Processing video...")
# Extract frames
frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
if not frames:
return "❌ Could not extract frames from the video. Please check the video format."
progress(0.5, desc="Analyzing content...")
if model_loaded and model is not None and tokenizer is not None:
# Try to use the actual model
try:
progress(0.7, desc="Running AI analysis...")
# Prepare prompt for VideoLLaMA
prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}
Please analyze this video and provide a detailed response.