import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")
# Global variables
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model_loaded = False
def load_videollama_model():
    """Load VideoLLaMA model with proper error handling"""
    global model, tokenizer, model_loaded
    try:
        print("🚀 Loading VideoLLaMA model...")
        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"
        # Configure quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
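        # NF4 4-bit quantization with double quantization shrinks the weight
        # footprint to roughly a quarter of fp16, while float16 compute keeps
        # generation quality reasonable on GPU.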
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )
        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        model_loaded = True
        print("✅ VideoLLaMA model loaded successfully!")
        return "✅ Model loaded successfully!"
    except Exception as e:
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("🔄 Falling back to basic video analysis...")
        return error_msg
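# Companion cleanup helper (a minimal sketch, not part of the original
# loading flow): releases the global model references and asks PyTorch to
# return cached GPU memory, which helps when reloading the model on a
# memory-constrained Space.
def unload_model():
    """Release the model and free GPU memory."""
    global model, tokenizer, model_loaded
    model = None
    tokenizer = None
    model_loaded = False
    gc.collect()  # drop the Python-side references
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached CUDA memory to the driver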
def extract_frames(video_path, max_frames=8):
    """Extract evenly spaced frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if total_frames == 0:
            # Return the same empty triple as the exception path so callers
            # can always unpack three values
            cap.release()
            return [], {}, []
        # Get evenly spaced frame indices
        frame_indices = np.linspace(0, total_frames - 1, min(max_frames, total_frames), dtype=int)
        frames = []
        timestamps = []
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize for efficiency while maintaining aspect ratio
                if width > 512 or height > 512:
                    scale = min(512 / width, 512 / height)
                    new_width = int(width * scale)
                    new_height = int(height * scale)
                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                frames.append(Image.fromarray(frame_rgb))
                timestamp = frame_idx / fps if fps > 0 else frame_idx
                timestamps.append(timestamp)
        cap.release()
        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames)
        }
        return frames, video_info, timestamps
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []
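# Quick sanity check for extract_frames (assumes a local "sample.mp4" exists):
#   frames, info, timestamps = extract_frames("sample.mp4", max_frames=4)
#   print(info.get("resolution"), [f"{t:.1f}s" for t in timestamps])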
def generate_basic_analysis(video_info, question, frames):
    """Generate basic video analysis when model is not available"""
    analysis_parts = []
    # Video technical info
    analysis_parts.append("📹 **Video Information:**")
    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
    analysis_parts.append(f"- Analyzed frames: {len(frames)}")
    # Basic visual analysis
    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")
    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)
        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
        analysis_parts.append("- Dominant colors: Analyzing RGB distribution...")
        # Simple color analysis
        r_avg = np.mean(first_frame[:, :, 0])
        g_avg = np.mean(first_frame[:, :, 1])
        b_avg = np.mean(first_frame[:, :, 2])
        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
                         "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
                         "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
                         "Balanced colors"
        analysis_parts.append(f"- Color tone: {dominant_color}")
    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\n🤖 **Analysis Response:**")
    # Generate contextual response based on question keywords
    question_lower = question.lower()
    if any(word in question_lower for word in ['what', 'describe', 'see']):
        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")
    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")
    if any(word in question_lower for word in ['people', 'person', 'human']):
        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")
    if any(word in question_lower for word in ['object', 'thing', 'item']):
        analysis_parts.append("Object detection and identification would require deeper model analysis. ")
    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
    return "\n".join(analysis_parts)
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Main video analysis function"""
    if video_file is None:
        return "❌ Please upload a video file first."
    if not question.strip():
        return "❌ Please enter a question about the video."
    try:
        progress(0.1, desc="Processing video...")
        # Extract frames
        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
        if not frames:
            return "❌ Could not extract frames from the video. Please check the video format."
        progress(0.5, desc="Analyzing content...")
        if model_loaded and model is not None and tokenizer is not None:
            # Try to use the actual model
            try:
                progress(0.7, desc="Running AI analysis...")
                # Prepare prompt for VideoLLaMA
                prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}
Please analyze this video and provide a detailed response.