import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")
# Global variables
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model_loaded = False
def load_videollama_model():
    """Load VideoLLaMA model with proper error handling"""
    global model, tokenizer, model_loaded
    try:
        print("🚀 Loading VideoLLaMA model...")
        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"
        # Configure quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
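        # NF4 4-bit quantization with double quantization shrinks the weight
        # footprint to roughly a quarter of fp16, while float16 compute keeps
        # generation quality reasonable on GPU.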
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )
        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        model_loaded = True
        print("✅ VideoLLaMA model loaded successfully!")
        return "✅ Model loaded successfully!"
    except Exception as e:
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("🔄 Falling back to basic video analysis...")
        return error_msg
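# Companion cleanup helper (a minimal sketch, not part of the original
# loading flow): releases the global model references and asks PyTorch to
# return cached GPU memory, which helps when reloading the model on a
# memory-constrained Space.
def unload_model():
    """Release the model and free GPU memory."""
    global model, tokenizer, model_loaded
    model = None
    tokenizer = None
    model_loaded = False
    gc.collect()  # drop the Python-side references
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached CUDA memory to the driver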
def extract_frames(video_path, max_frames=8):
    """Extract evenly spaced frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if total_frames == 0:
            # Return the same empty triple as the exception path so callers
            # can always unpack three values
            cap.release()
            return [], {}, []
        # Get evenly spaced frame indices
        frame_indices = np.linspace(0, total_frames - 1, min(max_frames, total_frames), dtype=int)
        frames = []
        timestamps = []
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize for efficiency while maintaining aspect ratio
                if width > 512 or height > 512:
                    scale = min(512 / width, 512 / height)
                    new_width = int(width * scale)
                    new_height = int(height * scale)
                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                frames.append(Image.fromarray(frame_rgb))
                timestamp = frame_idx / fps if fps > 0 else frame_idx
                timestamps.append(timestamp)
        cap.release()
        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames)
        }
        return frames, video_info, timestamps
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []
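# Quick sanity check for extract_frames (assumes a local "sample.mp4" exists):
#   frames, info, timestamps = extract_frames("sample.mp4", max_frames=4)
#   print(info.get("resolution"), [f"{t:.1f}s" for t in timestamps])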
def generate_basic_analysis(video_info, question, frames):
    """Generate basic video analysis when model is not available"""
    analysis_parts = []
    # Video technical info
    analysis_parts.append("📹 **Video Information:**")
    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
    analysis_parts.append(f"- Analyzed frames: {len(frames)}")
    # Basic visual analysis
    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")
    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)
        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
        analysis_parts.append("- Dominant colors: Analyzing RGB distribution...")
        # Simple color analysis
        r_avg = np.mean(first_frame[:, :, 0])
        g_avg = np.mean(first_frame[:, :, 1])
        b_avg = np.mean(first_frame[:, :, 2])
        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
                         "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
                         "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
                         "Balanced colors"
        analysis_parts.append(f"- Color tone: {dominant_color}")
    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\n🤖 **Analysis Response:**")
    # Generate contextual response based on question keywords
    question_lower = question.lower()
    if any(word in question_lower for word in ['what', 'describe', 'see']):
        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")
    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")
    if any(word in question_lower for word in ['people', 'person', 'human']):
        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")
    if any(word in question_lower for word in ['object', 'thing', 'item']):
        analysis_parts.append("Object detection and identification would require deeper model analysis. ")
    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
    return "\n".join(analysis_parts)
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Main video analysis function"""
    if video_file is None:
        return "❌ Please upload a video file first."
    if not question.strip():
        return "❌ Please enter a question about the video."
    try:
        progress(0.1, desc="Processing video...")
        # Extract frames
        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
        if not frames:
            return "❌ Could not extract frames from the video. Please check the video format."
        progress(0.5, desc="Analyzing content...")
        if model_loaded and model is not None and tokenizer is not None:
            # Try to use the actual model
            try:
                progress(0.7, desc="Running AI analysis...")
                # Prepare prompt for VideoLLaMA
                prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}
Please analyze this video and provide a detailed response.