cweigendev committed on
Commit
366ac1b
·
verified ·
1 Parent(s): b1330bb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image
6
+ import spaces
7
+ import gc
8
+ import os
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
+ import warnings
11
# Suppress every Python warning for the whole process.
warnings.filterwarnings("ignore")

# Global variables
# `model` and `tokenizer` stay None until load_videollama_model() assigns them.
model = None
tokenizer = None
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Set to True by load_videollama_model() on success, False on failure.
model_loaded = False
18
+
19
def load_videollama_model():
    """Load the VideoLLaMA tokenizer and 4-bit quantized model into globals.

    On success the module-level ``tokenizer`` and ``model`` are assigned and
    ``model_loaded`` is set to ``True``. Any exception raised while loading is
    caught here rather than propagated: ``model_loaded`` is set to ``False``,
    the error is printed, and the error text is returned so the application
    can keep running with the basic (model-free) analysis.

    Returns:
        str: A human-readable status message describing success or failure.
    """
    global model, tokenizer, model_loaded

    try:
        print("🔄 Loading VideoLLaMA model...")

        # Try to load a working multimodal model
        # Note: Replace with actual VideoLLaMA3 model when available
        model_name = "DAMO-NLP-SG/Video-LLaMA"

        # Configure quantization for memory efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False,
        )

        # Add padding token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model with quantization
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        model_loaded = True
        print("✅ VideoLLaMA model loaded successfully!")
        return "✅ Model loaded successfully!"

    except Exception as e:
        # Deliberate catch-all: a failed load must not crash the app, it only
        # switches the app to the basic analysis path.
        model_loaded = False
        error_msg = f"❌ Error loading model: {e}"
        print(error_msg)
        print("🔄 Falling back to basic video analysis...")
        return error_msg
71
+
72
def extract_frames(video_path, max_frames=8):
    """Extract up to ``max_frames`` evenly spaced RGB frames from a video.

    Args:
        video_path: Path of the video, passed straight to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of frames to sample.

    Returns:
        tuple: ``(frames, video_info, timestamps)``.
            * ``frames`` - list of PIL images converted from BGR to RGB. When
              the reported width or height exceeds 512 px the frames are
              resized by a single scale factor so both fit within 512 px.
            * ``video_info`` - dict with the keys ``total_frames``, ``fps``,
              ``duration``, ``resolution`` and ``extracted_frames``.
            * ``timestamps`` - one entry per returned frame: the frame index
              divided by fps, or the raw frame index when fps is not positive.
        ``([], {}, [])`` is returned when the video reports no frames or when
        any error occurs; errors are printed, never raised.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        if total_frames <= 0:
            # Always return three values: callers unpack
            # `frames, video_info, timestamps`.
            print("No frames found in video")
            return [], {}, []

        # The resize target depends only on the reported resolution, so work
        # it out once instead of per frame. Clamp to 1 px so an extreme aspect
        # ratio cannot produce a zero-sized dimension.
        target_size = None
        if width > 512 or height > 512:
            scale = min(512 / width, 512 / height)
            target_size = (
                max(1, int(width * scale)),
                max(1, int(height * scale)),
            )

        # Get evenly spaced frame indices
        frame_indices = np.linspace(
            0, total_frames - 1, min(max_frames, total_frames), dtype=int
        )
        frames = []
        timestamps = []

        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Skip frames that cannot be decoded instead of aborting.
                continue

            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Resize for efficiency while maintaining aspect ratio
            if target_size is not None:
                frame_rgb = cv2.resize(frame_rgb, target_size)

            frames.append(Image.fromarray(frame_rgb))
            timestamps.append(frame_idx / fps if fps > 0 else frame_idx)

        video_info = {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "resolution": f"{width}x{height}",
            "extracted_frames": len(frames),
        }

        return frames, video_info, timestamps

    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], {}, []

    finally:
        # Release the capture on every path, including the early return and
        # the error path.
        if cap is not None:
            cap.release()
122
+
123
def generate_basic_analysis(video_info, question, frames):
    """Build a Markdown report from video metadata and first-frame statistics.

    This is the fallback used when the model is not available. It never looks
    at the actual content of the video beyond simple pixel statistics of the
    first frame.

    Args:
        video_info: Mapping that may contain ``duration``, ``resolution``,
            ``fps`` and ``total_frames``; missing keys fall back to ``0`` or
            ``'Unknown'``.
        question: The user's question; matched case-insensitively against a
            few keyword groups to pick canned response sentences.
        frames: Sequence of frames (anything ``np.array`` accepts, e.g. PIL
            images). Only the first one is inspected. Frames without three
            colour channels (e.g. grayscale) are reported as
            "Balanced colors" instead of raising ``IndexError``.

    Returns:
        str: The report, with sections joined by newlines.
    """
    analysis_parts = [
        # Video technical info
        "📹 **Video Information:**",
        f"- Duration: {video_info.get('duration', 0):.1f} seconds",
        f"- Resolution: {video_info.get('resolution', 'Unknown')}",
        f"- Frame rate: {video_info.get('fps', 0):.1f} FPS",
        f"- Total frames: {video_info.get('total_frames', 0)}",
        f"- Analyzed frames: {len(frames)}",
        # Basic visual analysis
        "\n🎨 **Basic Visual Analysis:**",
    ]

    if frames:
        # Analyze first frame for basic info
        first_frame = np.array(frames[0])
        avg_brightness = np.mean(first_frame)
        color_variance = np.var(first_frame)

        analysis_parts.append(
            f"- Average brightness: {'Bright' if avg_brightness > 127 else 'Dark'}"
        )
        analysis_parts.append(
            f"- Color variance: {'High contrast' if color_variance > 1000 else 'Low contrast'}"
        )
        analysis_parts.append("- Dominant colors: Analyzing RGB distribution...")

        # Simple color analysis: a channel counts as dominant only when its
        # mean exceeds both other channel means by more than 20.
        dominant_color = "Balanced colors"
        if first_frame.ndim >= 3 and first_frame.shape[2] >= 3:
            r_avg = np.mean(first_frame[:, :, 0])
            g_avg = np.mean(first_frame[:, :, 1])
            b_avg = np.mean(first_frame[:, :, 2])

            if r_avg > max(g_avg, b_avg) + 20:
                dominant_color = "Red-tinted"
            elif g_avg > max(r_avg, b_avg) + 20:
                dominant_color = "Green-tinted"
            elif b_avg > max(r_avg, g_avg) + 20:
                dominant_color = "Blue-tinted"
        analysis_parts.append(f"- Color tone: {dominant_color}")

    # Question-specific response
    analysis_parts.append(f"\n❓ **Your Question:** '{question}'")
    analysis_parts.append("\n🤖 **Analysis Response:**")

    # Generate contextual response based on question keywords. Matching is by
    # substring, and every matching group contributes its sentence, in order.
    question_lower = question.lower()
    keyword_responses = (
        (
            ('what', 'describe', 'see'),
            "Based on the extracted frames, this video contains visual content that has been processed and analyzed. ",
        ),
        (
            ('action', 'activity', 'doing', 'happening'),
            "The video appears to show some form of activity or movement across the analyzed timepoints. ",
        ),
        (
            ('people', 'person', 'human'),
            "The analysis would need to examine the frames for human presence and activities. ",
        ),
        (
            ('object', 'thing', 'item'),
            "Object detection and identification would require deeper model analysis. ",
        ),
    )
    for keywords, response in keyword_responses:
        if any(word in question_lower for word in keywords):
            analysis_parts.append(response)

    analysis_parts.append("\n⚠️ **Note:** This is a basic analysis. For detailed AI-powered video understanding, the VideoLLaMA3 model needs to be properly loaded and configured.")

    return "\n".join(analysis_parts)
182
+
183
+ @spaces.GPU
184
+ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
185
+ """Main video analysis function"""
186
+
187
+ if video_file is None:
188
+ return "❌ Please upload a video file first."
189
+
190
+ if not question.strip():
191
+ return "❌ Please enter a question about the video."
192
+
193
+ try:
194
+ progress(0.1, desc="Processing video...")
195
+
196
+ # Extract frames
197
+ frames, video_info, timestamps = extract_frames(video_file, max_frames=8)
198
+
199
+ if not frames:
200
+ return "❌ Could not extract frames from the video. Please check the video format."
201
+
202
+ progress(0.5, desc="Analyzing content...")
203
+
204
+ if model_loaded and model is not None and tokenizer is not None:
205
+ # Try to use the actual model
206
+ try:
207
+ progress(0.7, desc="Running AI analysis...")
208
+
209
+ # Prepare prompt for VideoLLaMA
210
+ prompt = f"""Human: I have a video with the following details:
211
+ - Duration: {video_info.get('duration', 0):.1f} seconds
212
+ - {len(frames)} key frames extracted
213
+ - Question: {question}
214
+
215
+ Please analyze this video and provide a detailed response.