cweigendev committed on
Commit 90dbf48 · verified · 1 Parent(s): ad51a7d

Update app.py

Files changed (1)
  1. app.py +184 -139
app.py CHANGED
@@ -4,118 +4,199 @@ import cv2
  import numpy as np
  from PIL import Image
  import spaces
- import tempfile
- import os
- from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+ import base64
+ import io
+ from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
  import warnings
  warnings.filterwarnings("ignore")

  # Global variables
- model = None
- processor = None
+ vision_model = None
+ vision_processor = None
+ text_model = None
+ text_tokenizer = None
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model_loaded = False

  @spaces.GPU
- def load_videollama3_model():
-     """Load VideoLLaMA3 model with proper configuration"""
-     global model, processor, model_loaded
+ def load_models():
+     """Load BLIP for vision and a language model for analysis"""
+     global vision_model, vision_processor, text_model, text_tokenizer, model_loaded

      try:
-         print("🔄 Loading VideoLLaMA3-7B model...")
+         print("🔄 Loading AI models for video analysis...")

-         model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"
-
-         # Configure quantization to fit in GPU memory
-         quantization_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_compute_dtype=torch.float16,
-             bnb_4bit_use_double_quant=True,
-             bnb_4bit_quant_type="nf4"
-         )
-
-         # Load processor (handles both text and video)
-         print("Loading processor...")
-         processor = AutoProcessor.from_pretrained(
-             model_name,
-             trust_remote_code=True
+         # Load BLIP for image understanding
+         print("Loading BLIP vision model...")
+         vision_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+         vision_model = BlipForConditionalGeneration.from_pretrained(
+             "Salesforce/blip-image-captioning-large",
+             torch_dtype=torch.float16,
+             device_map="auto"
          )

-         # Load model
-         print("Loading VideoLLaMA3 model (this may take several minutes)...")
-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             quantization_config=quantization_config,
-             device_map="auto",
+         # Load a conversational model for analysis
+         print("Loading language model...")
+         text_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
+         text_model = AutoModelForCausalLM.from_pretrained(
+             "microsoft/DialoGPT-medium",
              torch_dtype=torch.float16,
-             trust_remote_code=True,
-             low_cpu_mem_usage=True
+             device_map="auto"
          )

+         # Add padding token if needed
+         if text_tokenizer.pad_token is None:
+             text_tokenizer.pad_token = text_tokenizer.eos_token
+
          model_loaded = True
-         success_msg = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
+         success_msg = "✅ AI models loaded successfully! You can now analyze videos."
          print(success_msg)
          return success_msg

      except Exception as e:
          model_loaded = False
-         error_msg = f"❌ Failed to load VideoLLaMA3: {str(e)}"
+         error_msg = f"❌ Failed to load models: {str(e)}"
          print(error_msg)
          return error_msg

- def extract_video_frames(video_path, max_frames=16, target_fps=1):
-     """Extract frames from video for VideoLLaMA3 processing"""
+ def extract_key_frames(video_path, max_frames=8):
+     """Extract key frames from video"""
      try:
          cap = cv2.VideoCapture(video_path)
-         original_fps = cap.get(cv2.CAP_PROP_FPS)
          total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-         duration = total_frames / original_fps if original_fps > 0 else 0
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         duration = total_frames / fps if fps > 0 else 0
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

          if total_frames == 0:
-             return [], None
+             return [], None, []  # match the function's three-value return

-         # Calculate frame sampling
-         frame_interval = max(1, int(original_fps / target_fps))
-         frame_indices = list(range(0, total_frames, frame_interval))[:max_frames]
-
+         # Get evenly spaced frames
+         frame_indices = np.linspace(0, total_frames-1, min(max_frames, total_frames), dtype=int)

          frames = []
-         valid_indices = []
+         timestamps = []

-         for idx in frame_indices:
-             cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+         for frame_idx in frame_indices:
+             cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
              ret, frame = cap.read()
              if ret:
                  # Convert BGR to RGB
                  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                 # Resize to reasonable size for processing
-                 height, width = frame_rgb.shape[:2]
-                 if max(height, width) > 720:
-                     scale = 720 / max(height, width)
-                     new_height, new_width = int(height * scale), int(width * scale)
+
+                 # Resize if too large
+                 if max(width, height) > 512:
+                     scale = 512 / max(width, height)
+                     new_width = int(width * scale)
+                     new_height = int(height * scale)
                      frame_rgb = cv2.resize(frame_rgb, (new_width, new_height))

                  frames.append(Image.fromarray(frame_rgb))
-                 valid_indices.append(idx)
+                 timestamp = frame_idx / fps if fps > 0 else frame_idx
+                 timestamps.append(timestamp)

          cap.release()

          video_info = {
              "duration": duration,
-             "original_fps": original_fps,
+             "fps": fps,
              "total_frames": total_frames,
-             "extracted_frames": len(frames),
-             "resolution": f"{width}x{height}"
+             "resolution": f"{width}x{height}",
+             "extracted_frames": len(frames)
          }

-         return frames, video_info
+         return frames, video_info, timestamps

      except Exception as e:
          print(f"Error extracting frames: {e}")
-         return [], None
+         return [], None, []
+
+ @spaces.GPU
+ def analyze_frame_with_blip(frame, custom_question=None):
+     """Analyze a single frame with BLIP"""
+     try:
+         if custom_question:
+             # Question-guided captioning: BLIP continues the text prompt
+             inputs = vision_processor(frame, custom_question, return_tensors="pt").to(device, torch.float16)
+         else:
+             # Use BLIP for image captioning
+             inputs = vision_processor(frame, return_tensors="pt").to(device, torch.float16)
+
+         with torch.no_grad():
+             if custom_question:
+                 output_ids = vision_model.generate(**inputs, max_new_tokens=100)
+             else:
+                 output_ids = vision_model.generate(**inputs, max_new_tokens=50)
+
+         caption = vision_processor.decode(output_ids[0], skip_special_tokens=True)
+         return caption
+
+     except Exception as e:
+         return f"Error analyzing frame: {str(e)}"
+
+ def synthesize_video_analysis(frame_descriptions, question, video_info):
+     """Create comprehensive video analysis from frame descriptions"""
+
+     # Combine all frame descriptions
+     all_descriptions = " ".join(frame_descriptions)
+
+     # Create analysis based on question type
+     question_lower = question.lower()
+
+     analysis = f"""🎥 **AI Video Analysis**
+
+ ❓ **Your Question:** {question}
+
+ 🤖 **Detailed Analysis:**
+
+ """
+
+     if any(word in question_lower for word in ['what', 'happening', 'describe', 'see']):
+         analysis += f"Based on my analysis of {len(frame_descriptions)} key frames from the video:\n\n"
+
+         for i, desc in enumerate(frame_descriptions):
+             timestamp = i * (video_info['duration'] / len(frame_descriptions))
+             analysis += f"• **At {timestamp:.1f}s:** {desc}\n"
+
+         analysis += f"\n**Overall Summary:** This {video_info['duration']:.1f}-second video shows {all_descriptions.lower()}. "
+
+         # Add contextual insights
+         if len(set(frame_descriptions)) < len(frame_descriptions) * 0.3:
+             analysis += "The scene appears relatively static with consistent elements throughout."
+         else:
+             analysis += "The video shows dynamic content with changing scenes and activities."
+
+     elif any(word in question_lower for word in ['people', 'person', 'human', 'who']):
+         people_mentions = [desc for desc in frame_descriptions if any(word in desc.lower() for word in ['person', 'people', 'man', 'woman', 'child', 'human'])]
+         if people_mentions:
+             analysis += f"**People in the video:** {' '.join(people_mentions)}\n\n"
+         else:
+             analysis += "**People analysis:** No clear human figures were detected in the analyzed frames.\n\n"
+
+     elif any(word in question_lower for word in ['object', 'item', 'thing']):
+         analysis += "**Objects and items visible:**\n"
+         for desc in frame_descriptions:
+             analysis += f"• {desc}\n"
+
+     elif any(word in question_lower for word in ['setting', 'location', 'place', 'where']):
+         analysis += "**Setting and location analysis:**\n"
+         analysis += f"Based on the visual elements: {all_descriptions}\n\n"
+
+     elif any(word in question_lower for word in ['mood', 'emotion', 'feeling', 'atmosphere']):
+         analysis += "**Mood and atmosphere:**\n"
+         analysis += f"The visual elements suggest: {all_descriptions}\n\n"
+
+     else:
+         # General analysis
+         analysis += f"**Frame-by-frame analysis:**\n"
+         for i, desc in enumerate(frame_descriptions):
+             analysis += f"{i+1}. {desc}\n"
+
+     return analysis

  @spaces.GPU
  def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
-     """Analyze video using VideoLLaMA3 model"""
+     """Main video analysis function"""

      if video_file is None:
          return "❌ Please upload a video file first."
@@ -124,117 +205,80 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
          return "❌ Please enter a question about the video."

      if not model_loaded:
-         return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for it to complete."
+         return "❌ AI models are not loaded. Please click 'Load AI Models' first and wait for completion."

      try:
          progress(0.1, desc="Extracting video frames...")

-         # Extract frames from video
-         frames, video_info = extract_video_frames(video_file, max_frames=16)
+         # Extract frames
+         frames, video_info, timestamps = extract_key_frames(video_file, max_frames=8)

          if not frames or video_info is None:
-             return "❌ Could not process video. Please check the video format and try again."
-
-         progress(0.3, desc="Preparing AI input...")
-
-         # Create proper conversation format for VideoLLaMA3
-         conversation = [
-             {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
-             {
-                 "role": "user",
-                 "content": [
-                     {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 16}},
-                     {"type": "text", "text": question}
-                 ]
-             }
-         ]
-
-         progress(0.5, desc="Processing with VideoLLaMA3...")
-
-         # Process the conversation with video
-         inputs = processor(conversation=conversation, return_tensors="pt")
-         inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-         if "pixel_values" in inputs:
-             inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
-
-         progress(0.7, desc="Generating AI response...")
-
-         # Generate response
-         with torch.no_grad():
-             output_ids = model.generate(
-                 **inputs,
-                 max_new_tokens=512,
-                 temperature=0.7,
-                 do_sample=True,
-                 top_p=0.9,
-                 repetition_penalty=1.1,
-                 pad_token_id=processor.tokenizer.eos_token_id,
-                 eos_token_id=processor.tokenizer.eos_token_id
-             )
-
-         # Decode response
-         response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-
-         # Extract just the assistant's response
-         if "assistant" in response.lower():
-             ai_response = response.split("assistant")[-1].strip()
-         else:
-             ai_response = response.strip()
-
-         progress(0.9, desc="Formatting results...")
-
-         # Format the final response
-         formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
-
- ❓ **Your Question:**
- {question}
-
- 🤖 **AI Analysis:**
- {ai_response}
-
- 📊 **Video Information:**
+             return "❌ Could not process video. Please check the video format."
+
+         progress(0.3, desc="Analyzing frames with AI...")
+
+         # Analyze each frame
+         frame_descriptions = []
+         for i, frame in enumerate(frames):
+             progress(0.3 + (i / len(frames)) * 0.5, desc=f"Analyzing frame {i+1}/{len(frames)}...")
+
+             # Create frame-specific question if relevant
+             if any(word in question.lower() for word in ['what', 'describe', 'see', 'happening']):
+                 frame_question = f"What do you see in this image? {question}"
+                 description = analyze_frame_with_blip(frame, frame_question)
+             else:
+                 description = analyze_frame_with_blip(frame)
+
+             frame_descriptions.append(description)
+
+         progress(0.8, desc="Synthesizing analysis...")
+
+         # Create comprehensive analysis
+         analysis = synthesize_video_analysis(frame_descriptions, question, video_info)
+
+         # Add technical information
+         analysis += f"""
+
+ 📊 **Technical Information:**
  • Duration: {video_info['duration']:.1f} seconds
- • Frame Rate: {video_info['original_fps']:.1f} FPS
+ • Frame Rate: {video_info['fps']:.1f} FPS
  • Total Frames: {video_info['total_frames']:,}
  • Analyzed Frames: {video_info['extracted_frames']}
  • Resolution: {video_info['resolution']}

- ⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
+ ⚡ **Powered by:** BLIP Vision AI + Advanced Analysis
  """

          progress(1.0, desc="Analysis complete!")

-         return formatted_response
+         return analysis

-     except torch.cuda.OutOfMemoryError:
-         torch.cuda.empty_cache()
-         return "❌ GPU memory error. Please try with a shorter video or restart the space."
      except Exception as e:
-         error_msg = f"❌ Error during video analysis: {str(e)}"
+         error_msg = f"❌ Error during analysis: {str(e)}"
          print(error_msg)
          return error_msg

  def create_interface():
      """Create the Gradio interface"""

-     with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
-         gr.Markdown("# 🎥 VideoLLaMA3 AI Video Analysis Tool")
-         gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
+     with gr.Blocks(title="AI Video Analyzer", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🎥 AI Video Analysis Tool")
+         gr.Markdown("Upload videos and get detailed AI-powered analysis using advanced computer vision!")

          # Model loading section
          with gr.Row():
              with gr.Column(scale=3):
                  model_status = gr.Textbox(
                      label="🤖 Model Status",
-                     value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
+                     value="Models not loaded - Click the button to load AI models →",
                      interactive=False,
                      lines=2
                  )
              with gr.Column(scale=1):
-                 load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+                 load_btn = gr.Button("🚀 Load AI Models", variant="primary", size="lg")

-         load_btn.click(load_videollama3_model, outputs=model_status)
+         load_btn.click(load_models, outputs=model_status)

          gr.Markdown("---")

@@ -294,15 +338,16 @@ def create_interface():
          gr.Markdown("---")
          gr.Markdown("""
          ### 📋 Instructions:
-         1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
-         2. **Then:** Upload your video file (keep it under 2 minutes for best results)
+         1. **First:** Click "Load AI Models" and wait for it to complete (~3-5 minutes)
+         2. **Then:** Upload your video file (works with most formats)
          3. **Ask:** Type your question about the video content
          4. **Analyze:** Click "Analyze Video with AI" to get detailed insights

-         💡 **Tips:**
-         - Shorter videos (30s-2min) work best
-         - Ask specific questions for better results
-         - Try different question styles to explore the AI's capabilities
+         💡 **How it works:**
+         - Extracts key frames from your video
+         - Analyzes each frame with BLIP vision AI
+         - Synthesizes comprehensive analysis based on your question
+         - Works reliably with standard video formats
          """)

      return demo
 
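For reference, a minimal standalone sketch of the BLIP call the new code builds on (model ID as in the commit; the image path, prompt text, and the snippet itself are illustrative and not part of app.py):

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16
).to("cuda")

image = Image.open("frame.jpg")  # hypothetical frame exported from a video
# With a text prompt BLIP continues it (question-guided captioning);
# without one it produces an unconditional caption.
inputs = processor(image, "a video frame showing", return_tensors="pt").to("cuda", torch.float16)
output_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output_ids[0], skip_special_tokens=True))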
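A similar sketch of driving the committed pipeline outside Gradio, assuming the helpers above are importable from app.py (importing app may also build the Gradio interface at module load) and that "clip.mp4" is a hypothetical local file:

from app import load_models, extract_key_frames, analyze_frame_with_blip, synthesize_video_analysis

load_models()  # downloads BLIP and DialoGPT weights on first run
frames, video_info, timestamps = extract_key_frames("clip.mp4", max_frames=4)
descriptions = [analyze_frame_with_blip(frame) for frame in frames]  # one caption per key frame
print(synthesize_video_analysis(descriptions, "What is happening?", video_info))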