cweigendev committed
Commit 98b2fa8 · verified · 1 Parent(s): 67e6bf3

Update app.py

Files changed (1)
  1. app.py +70 -82
app.py CHANGED
@@ -35,30 +35,30 @@ model_loaded = False
 
 @spaces.GPU
 def load_videollama3_model():
-    """Load VideoLLaMA3 model"""
+    """Load VideoLLaMA3 model with the correct implementation"""
     global model, processor, model_loaded
 
     try:
         print("🔄 Loading VideoLLaMA3-7B model...")
 
-        model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"
+        model_path = "DAMO-NLP-SG/VideoLLaMA3-7B"
 
         print("Loading processor...")
         processor = AutoProcessor.from_pretrained(
-            model_name,
+            model_path,
             trust_remote_code=True
         )
 
         print("Loading VideoLLaMA3 model (this may take several minutes)...")
         model = AutoModelForCausalLM.from_pretrained(
-            model_name,
+            model_path,
             trust_remote_code=True,
-            device_map="auto",
+            device_map={"": device},
             torch_dtype=torch.bfloat16,
         )
 
         model_loaded = True
-        success_msg = "✅ VideoLLaMA3-7B model loaded successfully! You can now analyze videos with AI."
+        success_msg = "✅ VideoLLaMA3-7B model loaded successfully! Ready for video analysis."
         print(success_msg)
         return success_msg
 
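Note on the device_map change in this hunk: device_map="auto" asks Accelerate to shard layers across whatever devices are visible, while device_map={"": device} maps the root module (the empty-string key), and therefore the whole model, onto a single device. A minimal sketch of the pinned form, assuming a `device` string such as "cuda:0"; the gpt2 checkpoint is illustrative only:

    import torch
    from transformers import AutoModelForCausalLM

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # The "" key addresses the root module, so every submodule lands on `device`.
    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",
        device_map={"": device},
        torch_dtype=torch.bfloat16,
    )
    print(next(model.parameters()).device)  # all parameters on one device

Pinning to one device is a common workaround when trust_remote_code models do their own tensor placement and sharding would otherwise produce cross-device errors.
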
@@ -70,7 +70,7 @@ def load_videollama3_model():
 
 @spaces.GPU
 def analyze_video_with_videollama3(video_file, question, progress=gr.Progress()):
-    """Analyze video using VideoLLaMA3"""
+    """Analyze video using VideoLLaMA3 - REAL implementation"""
 
     if video_file is None:
         return "❌ Please upload a video file first."
@@ -82,63 +82,72 @@ def analyze_video_with_videollama3(video_file, question, progress=gr.Progress())
         return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for completion."
 
     try:
-        progress(0.1, desc="Preparing video for analysis...")
+        progress(0.1, desc="Preparing video for VideoLLaMA3...")
 
-        # Create the conversation in the format VideoLLaMA3 expects
+        # Create the exact conversation format from VideoLLaMA3 official implementation
         conversation = [
-            {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
+            {"role": "system", "content": "You are a helpful assistant."},
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 64}},
+                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 128}},
                     {"type": "text", "text": question}
                 ]
             }
         ]
 
-        progress(0.3, desc="Processing video with VideoLLaMA3...")
+        progress(0.3, desc="Processing with VideoLLaMA3...")
 
-        # Process the conversation
-        inputs = processor(conversation=conversation, return_tensors="pt")
+        # Use the EXACT processor call from official VideoLLaMA3 code
+        inputs = processor(
+            conversation=conversation,
+            add_system_prompt=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        )
+
+        # Move inputs to device
         inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
 
         if "pixel_values" in inputs:
             inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
 
-        progress(0.7, desc="Generating AI response...")
+        progress(0.7, desc="Generating VideoLLaMA3 response...")
 
-        # Generate response
+        # Generate response with VideoLLaMA3
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=512,
-                temperature=0.7,
                 do_sample=True,
+                temperature=0.1,
+                use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 eos_token_id=processor.tokenizer.eos_token_id
             )
 
-        progress(0.9, desc="Processing response...")
+        progress(0.9, desc="Processing VideoLLaMA3 response...")
 
-        # Decode response
+        # Decode the response
         response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
 
-        # Extract assistant response
+        # Extract assistant response - VideoLLaMA3 specific parsing
         if "assistant" in response.lower():
             ai_response = response.split("assistant")[-1].strip()
-        elif "user:" in response.lower():
-            parts = response.split("user:")
+        elif "<|im_start|>assistant" in response:
+            ai_response = response.split("<|im_start|>assistant")[-1].strip()
+        else:
+            # Fallback: extract everything after the user's question
+            parts = response.split(question)
             if len(parts) > 1:
                 ai_response = parts[-1].strip()
             else:
                 ai_response = response.strip()
-        else:
-            ai_response = response.strip()
 
-        # Clean up the response
-        ai_response = ai_response.replace("</s>", "").strip()
+        # Clean up response tokens
+        ai_response = ai_response.replace("<|im_end|>", "").replace("</s>", "").strip()
 
-        # Get video info for technical details
+        # Get video metadata
         cap = cv2.VideoCapture(video_file)
         fps = cap.get(cv2.CAP_PROP_FPS)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
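
Aside: the string-splitting on "assistant" above can be brittle (the question itself may contain the word), and if the tokenizer registers <|im_start|> as a special token, as Qwen-style tokenizers do, skip_special_tokens=True will already have stripped it before the elif runs. A standard Hugging Face alternative, sketched here and not part of this commit, is to decode only the newly generated tokens, assuming `inputs` still holds the prompt's "input_ids":

    # model.generate returns prompt + completion for decoder-only models; slice off the prompt
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output_ids[:, prompt_len:]
    ai_response = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
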
@@ -147,74 +156,52 @@ def analyze_video_with_videollama3(video_file, question, progress=gr.Progress())
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         cap.release()
 
-        progress(1.0, desc="Analysis complete!")
+        progress(1.0, desc="VideoLLaMA3 analysis complete!")
 
-        # Format the final response
+        # Format response
         formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
 
 ❓ **Your Question:**
 {question}
 
-🤖 **AI Analysis:**
+🤖 **VideoLLaMA3 Response:**
 {ai_response}
 
-📊 **Video Information:**
+📊 **Video Details:**
 • Duration: {duration:.1f} seconds
+• Resolution: {width}x{height}
 • Frame Rate: {fps:.1f} FPS
 • Total Frames: {total_frames:,}
-• Resolution: {width}x{height}
+• Analyzed with: Up to 128 frames at 1 FPS
 
-⚡ **Powered by:** VideoLLaMA3-7B (Multimodal AI)
+⚡ **Powered by:** VideoLLaMA3-7B (Official Implementation)
 """
 
         return formatted_response
 
     except Exception as e:
-        error_msg = f"❌ Error during VideoLLaMA3 analysis: {str(e)}"
-        print(error_msg)
-
-        # Fallback: Basic video analysis if VideoLLaMA3 fails
-        try:
-            cap = cv2.VideoCapture(video_file)
-            fps = cap.get(cv2.CAP_PROP_FPS)
-            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            duration = total_frames / fps if fps > 0 else 0
-            cap.release()
-
-            fallback_response = f"""❌ VideoLLaMA3 analysis failed, but here's what I can tell you:
-
-**Video Technical Info:**
-• Duration: {duration:.1f} seconds
-• Frame Rate: {fps:.1f} FPS
-• Total Frames: {total_frames:,}
-
-**Error:** {str(e)}
-
-**Suggestion:** Try reloading the model or using a shorter video file.
-"""
-            return fallback_response
-
-        except:
-            return error_msg
+        error_msg = f"❌ VideoLLaMA3 analysis failed: {str(e)}"
+        print(f"Full error: {e}")
+        return error_msg
 
 def create_interface():
-    """Create the Gradio interface"""
+    """Create the VideoLLaMA3 interface"""
 
-    with gr.Blocks(title="VideoLLaMA3 AI Analyzer", theme=gr.themes.Soft()) as demo:
+    with gr.Blocks(title="VideoLLaMA3 Official", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🎥 VideoLLaMA3 Video Analysis Tool")
-        gr.Markdown("Upload videos and get detailed AI-powered analysis using VideoLLaMA3-7B!")
+        gr.Markdown("**Official VideoLLaMA3-7B implementation** - Upload videos and get detailed AI analysis!")
 
         # Model loading section
         with gr.Row():
             with gr.Column(scale=3):
                 model_status = gr.Textbox(
-                    label="🤖 Model Status",
-                    value="Model not loaded - Click the button to load VideoLLaMA3-7B →",
+                    label="🤖 VideoLLaMA3 Model Status",
+                    value="Model not loaded - Click button to load VideoLLaMA3-7B →",
                     interactive=False,
                     lines=2
                 )
             with gr.Column(scale=1):
-                load_btn = gr.Button("🚀 Load VideoLLaMA3 Model", variant="primary", size="lg")
+                load_btn = gr.Button("🚀 Load VideoLLaMA3", variant="primary", size="lg")
 
         load_btn.click(load_videollama3_model, outputs=model_status)
 
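For reference, the OpenCV metadata reads used above can be collected into one helper. This is a sketch, not part of the commit; `probe_video` is a hypothetical name, and the zero-FPS guard mirrors the duration computation in the diff:

    import cv2

    def probe_video(path):
        """Return (duration_s, fps, total_frames, width, height) for a video file."""
        cap = cv2.VideoCapture(path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        duration = total_frames / fps if fps > 0 else 0.0
        return duration, fps, total_frames, width, height
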
@@ -228,31 +215,31 @@ def create_interface():
                     height=350
                 )
                 question_input = gr.Textbox(
-                    label="❓ Ask about the video",
+                    label="❓ Ask VideoLLaMA3 about the video",
                     placeholder="What is happening in this video? Describe it in detail.",
                     lines=3,
                     max_lines=5
                 )
-                analyze_btn = gr.Button("🔍 Analyze Video with VideoLLaMA3", variant="primary", size="lg")
+                analyze_btn = gr.Button("🔍 Analyze with VideoLLaMA3", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 output = gr.Textbox(
-                    label="🎯 AI Analysis Results",
+                    label="🎯 VideoLLaMA3 Analysis Results",
                     lines=25,
                     max_lines=30,
                     show_copy_button=True
                 )
 
         # Example questions
-        gr.Markdown("### 💡 Example Questions (click to use):")
+        gr.Markdown("### 💡 Example Questions for VideoLLaMA3:")
 
         example_questions = [
             "What is happening in this video? Describe the scene in detail.",
             "Who are the people in this video and what are they doing?",
             "Describe the setting, location, and environment shown.",
-            "What objects, animals, or items can you see in the video?",
-            "What is the mood, atmosphere, or emotion conveyed?",
-            "Summarize the key events that occur chronologically."
+            "What objects can you identify in this video?",
+            "What is the mood or atmosphere of this video?",
+            "Can you summarize the key events chronologically?"
         ]
 
         with gr.Row():
@@ -265,7 +252,7 @@ def create_interface():
                 btn2 = gr.Button(example_questions[i+1], size="sm")
                 btn2.click(lambda x=example_questions[i+1]: x, outputs=question_input)
 
-        # Connect the analyze button
+        # Connect analyze button
         analyze_btn.click(
             analyze_video_with_videollama3,
             inputs=[video_input, question_input],
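
The `lambda x=example_questions[i+1]: x` pattern in the unchanged context above is the standard fix for Python's late-binding closures: without the default argument, every button created in the loop would return the loop variable's final value. A self-contained illustration:

    # Late binding: each lambda looks up q when called, after the loop has finished
    callbacks = [lambda: q for q in ("a", "b", "c")]
    print([f() for f in callbacks])   # ['c', 'c', 'c']

    # Default-argument binding (the pattern used above): q is evaluated per iteration
    callbacks = [lambda q=q: q for q in ("a", "b", "c")]
    print([f() for f in callbacks])   # ['a', 'b', 'c']
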
@@ -275,16 +262,17 @@ def create_interface():
 
         gr.Markdown("---")
         gr.Markdown("""
-        ### 📋 Instructions:
-        1. **First:** Click "Load VideoLLaMA3 Model" and wait for it to complete (~5-10 minutes)
-        2. **Then:** Upload your video file (works best with videos under 2 minutes)
-        3. **Ask:** Type your question about the video content
-        4. **Analyze:** Click "Analyze Video with VideoLLaMA3" to get detailed insights
+        ### 📋 How to Use VideoLLaMA3:
+        1. **Load Model:** Click "Load VideoLLaMA3" and wait (~10 minutes for first load)
+        2. **Upload Video:** Choose your video file (works best under 2 minutes)
+        3. **Ask Question:** Type what you want to know about the video
+        4. **Analyze:** Click "Analyze with VideoLLaMA3" for AI-powered insights
 
-        💡 **Tips:**
-        - Keep videos under 2 minutes for best performance
-        - Ask specific, detailed questions for better results
-        - The model will analyze up to 64 frames from your video
+        ### 🔧 Technical Details:
+        - **Model:** VideoLLaMA3-7B (Official DAMO-NLP-SG implementation)
+        - **Analysis:** Processes up to 128 frames at 1 FPS sampling
+        - **Capabilities:** Video understanding, object detection, scene description, temporal reasoning
+        - **Best Performance:** Videos under 2 minutes, clear visuals, specific questions
         """)
 
         return demo
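
create_interface() returns the Blocks app, but the diff does not show the launch code. A typical entry point (assumed, not part of this commit) would be:

    if __name__ == "__main__":
        demo = create_interface()
        demo.queue().launch()  # queue() is optional, but helps with long-running GPU jobs
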
 