import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import spaces
import gc
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore")

# Global variables
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model_loaded = False

def load_videollama_model():
    """Load VideoLLaMA model with proper error handling"""
    global model, tokenizer, model_loaded
    
    try:
        print("πŸ”„ Loading VideoLLaMA model...")
        
        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"
        
        # Configure quantization for memory efficiency
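        # NF4 with double quantization stores weights in ~4 bits instead of 16, so a
        # 7B-class checkpoint fits in roughly 4-5 GB of GPU memory rather than ~14 GB in fp16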
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
        
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )
        
        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
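        # Many causal-LM tokenizers ship without a pad token; reusing EOS avoids
        # padding errors if generation ever batches inputs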
        
        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        
        model_loaded = True
        print("βœ… VideoLLaMA model loaded successfully!")
        return "βœ… Model loaded successfully!"
        
    except Exception as e:
        model_loaded = False
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        print("πŸ”„ Falling back to basic video analysis...")
        return error_msg

def extract_frames(video_path, max_frames=8):
    """Extract evenly spaced frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        
        if total_frames == 0:
            cap.release()
            return [], {}, []
        
        # Get evenly spaced frame indices
        frame_indices = np.linspace(0, total_frames-1, min(max_frames, total_frames), dtype=int)
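        # e.g. a 240-frame clip with max_frames=8 yields indices [0, 34, 68, ..., 239]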
        frames = []
        timestamps = []
        
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize for efficiency while maintaining aspect ratio
                if width > 512 or height > 512:
                    scale = min(512/width, 512/height)
                    new_width = int(width * scale)
                    new_height = int(height * scale)
                    frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))
                
                frames.append(Image.fromarray(frame_rgb))
                timestamp = frame_idx / fps if fps > 0 else frame_idx
                timestamps.append(timestamp)
        
        cap.release()
        
        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames)
        }
        
        return frames, video_info, timestamps
        
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []
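
# Hypothetical smoke test (path and values are illustrative, not part of the app):
#   frames, info, timestamps = extract_frames("sample.mp4", max_frames=4)
#   print(info.get("resolution"), len(frames), timestamps)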

def generate_basic_analysis(video_info, question, frames):
    """Generate basic video analysis when model is not available"""
    
    analysis_parts = []
    
    # Video technical info
    analysis_parts.append("πŸ“Ή **Video Information:**")
    analysis_parts.append(f"- Duration: {video_info.get('duration', 0):.1f} seconds")
    analysis_parts.append(f"- Resolution: {video_info.get('resolution', 'Unknown')}")
    analysis_parts.append(f"- Frame rate: {video_info.get('fps', 0):.1f} FPS")
    analysis_parts.append(f"- Total frames: {video_info.get('total_frames', 0)}")
    analysis_parts.append(f"- Analyzed frames: {len(frames)}")
    
    # Basic visual analysis
    analysis_parts.append("\n🎨 **Basic Visual Analysis:**")
    
    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)
        
        analysis_parts.append(f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}")
        analysis_parts.append(f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}")
        analysis_parts.append(f"- Dominant colors: Analyzing RGB distribution...")
        
        # Simple color analysis
        r_avg = np.mean(first_frame[:,:,0])
        g_avg = np.mean(first_frame[:,:,1])
        b_avg = np.mean(first_frame[:,:,2])
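        # A channel counts as a tint only if its mean exceeds both others by 20 on the 0-255 scale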
        
        dominant_color = "Red-tinted" if r_avg > max(g_avg, b_avg) + 20 else \
                        "Green-tinted" if g_avg > max(r_avg, b_avg) + 20 else \
                        "Blue-tinted" if b_avg > max(r_avg, g_avg) + 20 else \
                        "Balanced colors"
        analysis_parts.append(f"- Color tone: {dominant_color}")
    
    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\nπŸ€– **Analysis Response:**")
    
    # Generate contextual response based on question keywords
    question_lower = question.lower()
    
    if any(word in question_lower for word in ['what', 'describe', 'see']):
        analysis_parts.append("Based on the extracted frames, this video contains visual content that has been processed and analyzed. ")
    
    if any(word in question_lower for word in ['action', 'activity', 'doing', 'happening']):
        analysis_parts.append("The video appears to show some form of activity or movement across the analyzed timepoints. ")
    
    if any(word in question_lower for word in ['people', 'person', 'human']):
        analysis_parts.append("The analysis would need to examine the frames for human presence and activities. ")
    
    if any(word in question_lower for word in ['object', 'thing', 'item']):
        analysis_parts.append("Object detection and identification would require deeper model analysis. ")
    
    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")
    
    return "\n".join(analysis_parts)

@spaces.GPU
def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
    """Main video analysis function"""
    
    if video_file is None:
        return "❌ Please upload a video file first."
    
    if not question.strip():
        return "❌ Please enter a question about the video."
    
    try:
        progress(0.1, desc="Processing video...")
        
        # Extract frames
        frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
        
        if not frames:
            return "❌ Could not extract frames from the video. Please check the video format."
        
        progress(0.5, desc="Analyzing content...")
        
        if model_loaded and model is not None and tokenizer is not None:
            # Try to use the actual model
            try:
                progress(0.7, desc="Running AI analysis...")
                
                # Prepare prompt for VideoLLaMA
                prompt = f"""Human: I have a video with the following details:
- Duration: {video_info.get('duration', 0):.1f} seconds
- {len(frames)} key frames extracted
- Question: {question}

Please analyze this video and provide a detailed response.