jomasego committed
Commit fb2e8d8 · Parent: 0036cd7

Update backend and dependencies

Files changed (2):
  1. modal_whisper_app.py +708 -411
  2. requirements.txt +0 -2
modal_whisper_app.py CHANGED
@@ -1,514 +1,811 @@
  import modal
  import os
  import tempfile
- import io
-
- # Environment variable for model name, configurable in Modal UI or via .env
- # This will be used by both the pre-caching function and the runtime function
- WHISPER_MODEL_NAME = os.environ.get("HF_WHISPER_MODEL_NAME", "openai/whisper-large-v3")
  CAPTION_MODEL_NAME = "Neleac/SpaceTimeGPT"
- CAPTION_PROCESSOR_NAME = "MCG-NJU/videomae-base"
- CAPTION_TOKENIZER_NAME = "gpt2" # SpaceTimeGPT uses GPT-2 as decoder
  ACTION_MODEL_NAME = "MCG-NJU/videomae-base-finetuned-kinetics"
- ACTION_PROCESSOR_NAME = "MCG-NJU/videomae-base-finetuned-kinetics" # Often the same as model for VideoMAE
-
- # Initialize a Modal Dict for caching results
- # The key will be a hash of the video URL or video content
- video_analysis_cache = modal.Dict.from_name(
-     "video-analysis-cache", create_if_missing=True
- )
-
- def download_whisper_model():
-     import torch
-     from transformers import pipeline
-     print(f"Downloading and caching Whisper model: {WHISPER_MODEL_NAME}")
-     pipeline(
-         "automatic-speech-recognition",
-         model=WHISPER_MODEL_NAME,
-         torch_dtype=torch.float32,
-         device="cpu"
-     )
-     print(f"Whisper model {WHISPER_MODEL_NAME} cached successfully.")

- def download_caption_model():
-     import torch
-     from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
-     print(f"Downloading and caching caption model: {CAPTION_MODEL_NAME}")
-     # Download image processor
-     AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
-     print(f"Image processor {CAPTION_PROCESSOR_NAME} cached.")
-     # Download tokenizer
-     AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
-     print(f"Tokenizer {CAPTION_TOKENIZER_NAME} cached.")
-     # Download main model
-     VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)
-     print(f"Caption model {CAPTION_MODEL_NAME} cached successfully.")
-
- def download_action_model():
-     from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
-     print(f"Downloading and caching action recognition model: {ACTION_MODEL_NAME}")
-     # Download image processor
-     VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
-     print(f"Action model processor {ACTION_PROCESSOR_NAME} cached.")
-     # Download main model
-     VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
-     print(f"Action model {ACTION_MODEL_NAME} cached successfully.")
-
- # Define the Modal image
- whisper_image = (
      modal.Image.debian_slim(python_version="3.10")
      .apt_install("ffmpeg")
-     .run_commands(
-         "echo 'Force reinstalling moviepy...'",
-         "pip install --force-reinstall moviepy",
-         "echo 'Checking moviepy installation...'",
-         "pip show moviepy || echo 'pip show moviepy failed'",
-         "echo 'Attempting to import moviepy.editor during build:'",
-         "python -c 'import moviepy; print(f\"moviepy module loaded from: {moviepy.__file__}\"); from moviepy.video.io.VideoFileClip import VideoFileClip; print(\"moviepy.video.io.VideoFileClip.VideoFileClip class import successful\")'"
-     ) # Force install moviepy and add diagnostics
      .pip_install(
-         "transformers[torch]",
-         "accelerate",
-         "soundfile",
-         "moviepy", # Essential for audio extraction from video
-         "huggingface_hub",
-         "ffmpeg-python",
-         "av", # For video frame extraction
-         "fastapi[standard]" # For web endpoints
      )
-     .run_function(download_whisper_model)
-     .run_function(download_caption_model)
-     .run_function(download_action_model) # This runs download_action_model during image build
  )

- app = modal.App(name="whisper-transcriber") # Changed from modal.Stub to modal.App


- # Hugging Face Token - retrieve from memory and set as Modal Secret
- # IMPORTANT: Create a Modal Secret named 'my-huggingface-secret' with your actual HF_TOKEN.
- # Example: modal secret create my-huggingface-secret HF_TOKEN=your_hf_token_here
  HF_TOKEN_SECRET = modal.Secret.from_name("my-huggingface-secret")

- @app.function(
-     image=whisper_image,
-     secrets=[HF_TOKEN_SECRET],
-     timeout=1200,
-     gpu="any" # Request any available GPU
- )
- def transcribe_video_audio(video_bytes: bytes) -> str:
-     # Imports moved inside the function to avoid local ModuleNotFoundError during `modal deploy`
-     from moviepy.video.io.VideoFileClip import VideoFileClip # More specific import for moviepy 2.2.1
-     import soundfile as sf
-     import torch
-     from transformers import pipeline # This will now use the pre-cached model
      from huggingface_hub import login
-
-     if not video_bytes:
-         return "Error: No video data received."
-
-     # Login to Hugging Face Hub using the token from Modal secrets
-     hf_token = os.environ.get("HF_TOKEN") # Standard key for Hugging Face token in Modal secrets if set as HF_TOKEN=...
      if hf_token:
          try:
              login(token=hf_token)
              print("Successfully logged into Hugging Face Hub.")
          except Exception as e:
-             print(f"Hugging Face Hub login failed: {e}. Proceeding, but private models may not be accessible.")
      else:
-         print("HF_TOKEN secret not found. Proceeding without login (works for public models).")

-     print(f"Processing video for transcription using model: {WHISPER_MODEL_NAME}")
-
-     # Initialize pipeline inside the function.
-     # For production/frequent use, consider @stub.cls to load the model once per container lifecycle.
-     print("Loading Whisper model...")
-     device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
-     # Use float16 for GPU for faster inference and less memory, float32 for CPU
-     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-     transcriber = pipeline(
-         "automatic-speech-recognition",
-         model=WHISPER_MODEL_NAME,
-         torch_dtype=torch_dtype,
-         device=device_map,
-     )
-     print(f"Whisper model loaded on device: {device_map} with dtype: {torch_dtype}")
-
-     video_path = None
-     audio_path = None

      try:
          with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
              tmp_video_file.write(video_bytes)
              video_path = tmp_video_file.name
-             print(f"Temporary video file saved: {video_path}")
-
-         print("Extracting audio from video...")
-         video_clip = VideoFileClip(video_path)
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
-             audio_path = tmp_audio_file.name
-             video_clip.audio.write_audiofile(audio_path, codec='pcm_s16le', logger=None)
-         video_clip.close()
-         print(f"Audio extracted to: {audio_path}")
-
-         audio_input, samplerate = sf.read(audio_path)
-         if audio_input.ndim > 1:
-             audio_input = audio_input.mean(axis=1) # Convert to mono

-         print(f"Audio data shape: {audio_input.shape}, Samplerate: {samplerate}")
-         print("Starting transcription...")
-         # Pass audio as a dictionary for more control, or directly as numpy array
-         # Adding chunk_length_s for handling long audio files better.
-         result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False, generate_kwargs={"temperature": 0.2, "no_repeat_ngram_size": 3, "language": "en"})
-         transcribed_text = result["text"]

-         print(f"Transcription successful. Length: {len(transcribed_text)}")
-         if len(transcribed_text) > 100:
-             print(f"Transcription preview: {transcribed_text[:100]}...")
-         else:
-             print(f"Transcription: {transcribed_text}")
-
-         return transcribed_text

      except Exception as e:
-         print(f"Error during transcription process: {e}")
          import traceback
-         traceback.print_exc()
-         return f"Error: Transcription failed. Details: {str(e)}"
      finally:
-         for p in [video_path, audio_path]:
-             if p and os.path.exists(p):
-                 try:
-                     os.remove(p)
-                     print(f"Removed temporary file: {p}")
-                 except Exception as e_rm:
-                     print(f"Error removing temporary file {p}: {e_rm}")
-
- # This is a local entrypoint for testing the Modal function if you run `modal run modal_whisper_app.py`
- @app.local_entrypoint()
- def main():
-     # This is just an example of how you might test.
-     # You'd need a sample video file (e.g., "sample.mp4") in the same directory.
-     # For actual deployment, this main function isn't strictly necessary as Gradio will call the webhook.
-     sample_video_path = "sample.mp4"
-     if not os.path.exists(sample_video_path):
-         print(f"Sample video {sample_video_path} not found. Skipping local test run.")
-         return
-
-     with open(sample_video_path, "rb") as f:
-         video_bytes_content = f.read()
-
-     print(f"Testing transcription with {sample_video_path}...")
-     transcription = transcribe_video_audio.remote(video_bytes_content)
-     print("----")
-     print(f"Transcription Result: {transcription}")
-     print("----")
-
- # To call this function from another Python script (after deployment):
- # import modal
- # Ensure the app name matches the one in modal.App(name=...)
- # The exact lookup method might vary slightly with modal.App, often it's:
- # deployed_app = modal.App.lookup("whisper-transcriber")
- # or by accessing the function directly if the app is deployed with a name.
- # For a deployed function, you might use its tag or webhook URL directly.
- # Example using a direct function call if deployed and accessible:
- # f = modal.Function.lookup("whisper-transcriber/transcribe_video_audio") # Or similar based on deployment output
- # For invoking:
- # result = f.remote(your_video_bytes) # for async
- # print(result)
- # Or, if you have the app object:
- # result = app.functions.transcribe_video_audio.remote(your_video_bytes)
- # Consult Modal documentation for the precise invocation method for your Modal version and deployment style.
-
- # Note: When deploying to Modal, Modal uses the `app.serve()` or `app.deploy()` mechanism.
- # The Gradio app will call the deployed Modal function via its HTTP endpoint.

  @app.function(
-     image=whisper_image,
      secrets=[HF_TOKEN_SECRET],
-     timeout=900, # Potentially shorter if model is pre-loaded and efficient
-     gpu="any" # Request any available GPU
  )
- def generate_video_caption(video_bytes: bytes) -> str:
      import torch
-     import av # PyAV for frame extraction
-     from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
-     import tempfile
-     import os
      import numpy as np

-     if not video_bytes:
-         return "Error: No video data received for captioning."
-
-     print(f"Starting video captioning with {CAPTION_MODEL_NAME}...")
      video_path = None
      try:
-         # 1. Load pre-cached model, processor, and tokenizer
-         # Ensure these names match what's used in download_caption_model
-         image_processor = AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
-         tokenizer = AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
-         model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)

          device = "cuda:0" if torch.cuda.is_available() else "cpu"
          model.to(device)
-         print(f"Caption model loaded on device: {device}")

-         # 2. Save video_bytes to a temporary file to be read by PyAV
          with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
              tmp_video_file.write(video_bytes)
              video_path = tmp_video_file.name
-             print(f"Temporary video file for captioning saved: {video_path}")

-         # 3. Frame extraction using PyAV
          container = av.open(video_path)
-         # Select 8 frames evenly spaced throughout the video
-         # Similar to the SpaceTimeGPT example
-         total_frames = container.streams.video[0].frames
-         num_frames_to_sample = 8
-         indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)

-         frames = []
-         container.seek(0) # Reset stream to the beginning
-         frame_idx = 0
-         target_idx_ptr = 0
-         for frame in container.decode(video=0):
-             if target_idx_ptr < len(indices) and frame_idx == indices[target_idx_ptr]:
-                 frames.append(frame.to_image()) # Convert to PIL Image
-                 target_idx_ptr += 1
-             frame_idx += 1
-             if len(frames) == num_frames_to_sample:
-                 break
          container.close()

-         if not frames:
-             print("No frames extracted, cannot generate caption.")
-             return "Error: Could not extract frames for captioning."
-         print(f"Extracted {len(frames)} frames for captioning.")
-
-         # 4. Generate caption
-         # The SpaceTimeGPT example doesn't use a specific prompt, it generates from frames directly
-         pixel_values = image_processor(images=frames, return_tensors="pt").pixel_values.to(device)
-         # The model card for Neleac/SpaceTimeGPT uses max_length=128, num_beams=5
-         generated_ids = model.generate(pixel_values, max_length=128, num_beams=5)
-         caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

-         print(f"Generated caption: {caption}")
-         return caption

      except Exception as e:
-         print(f"Error during video captioning: {e}")
          import traceback
          traceback.print_exc()
-         return f"Error: Video captioning failed. Details: {str(e)}"
      finally:
          if video_path and os.path.exists(video_path):
-             try:
-                 os.remove(video_path)
-                 print(f"Removed temporary video file for captioning: {video_path}")
-             except Exception as e_rm:
-                 print(f"Error removing temporary captioning video file {video_path}: {e_rm}")

  @app.function(
-     image=whisper_image,
      secrets=[HF_TOKEN_SECRET],
-     timeout=1800, # Increased timeout for combined processing
-     gpu="any"
  )
- @modal.concurrent(max_inputs=10) # Replaces allow_concurrent_inputs
- @modal.fastapi_endpoint(method="POST") # Replaces web_endpoint
- async def process_video_context(video_bytes: bytes, video_url: str = None):
-     import json
-     import hashlib
-
-     if not video_bytes:
-         return modal.Response(status_code=400, body=json.dumps({"error": "No video data provided."}))
-
-     # Generate a cache key
-     # If URL is provided, use it. Otherwise, hash the video content (can be slow for large videos).
-     cache_key = ""
-     if video_url:
-         cache_key = hashlib.sha256(video_url.encode()).hexdigest()
-     else:
-         # Hashing large video_bytes can be memory/CPU intensive. Consider alternatives if this is an issue.
-         # For now, let's proceed with hashing bytes if no URL.
-         cache_key = hashlib.sha256(video_bytes).hexdigest()
-
-     print(f"Generated cache key: {cache_key}")

-     # Check cache first
-     if cache_key in video_analysis_cache:
-         print(f"Cache hit for key: {cache_key}")
-         cached_result = video_analysis_cache[cache_key]
-         return modal.Response(status_code=200, body=json.dumps(cached_result))
-
-     print(f"Cache miss for key: {cache_key}. Processing video...")

      results = {}
-     error_messages = []

-     # Call transcription and captioning in parallel
-     transcription_future = transcribe_video_audio.spawn(video_bytes)
-     caption_call = generate_video_caption.spawn(video_bytes)
-     action_call = generate_action_labels.spawn(video_bytes) # Placeholder for now

      try:
-         transcription_result = await transcription_future
-         if transcription_result.startswith("Error:"):
-             error_messages.append(f"Transcription: {transcription_result}")
-             results["transcription"] = None
-         else:
-             results["transcription"] = transcription_result
      except Exception as e:
-         print(f"Error in transcription task: {e}")
-         error_messages.append(f"Transcription: Failed with exception - {str(e)}")
-         results["transcription"] = None

      try:
-         caption_result = await caption_call
-         if caption_result.startswith("Error:"):
-             error_messages.append(f"Captioning: {caption_result}")
-             results["video_caption"] = None
-         else:
-             results["video_caption"] = caption_result
      except Exception as e:
-         print(f"Error in captioning task: {e}")
-         error_messages.append(f"Captioning: Failed with exception - {str(e)}")
-         results["video_caption"] = None

      try:
-         action_result = await action_call # action_result is a dict from generate_action_labels
-         if action_result.get("error"):
-             error_messages.append(f"Action recognition: {action_result.get('error')}")
-             results["action_recognition"] = None
-         else:
-             results["action_recognition"] = action_result.get("actions", "No actions detected or error in result format")
      except Exception as e:
-         print(f"Error in action recognition task: {e}")
-         import traceback
-         traceback.print_exc()
-         error_messages.append(f"Action recognition: Failed with exception - {str(e)}")
-         results["action_recognition"] = None

-     # TODO: Add calls for object detection here in the future
-     results["object_detection"] = "(Object detection/tracking not yet implemented)"

-     if error_messages:
-         results["processing_errors"] = error_messages
-         # Store partial results in cache even if there are errors
-         video_analysis_cache[cache_key] = results
-         return modal.Response(status_code=207, body=json.dumps(results)) # 207 Multi-Status

-     # Store successful full result in cache
-     video_analysis_cache[cache_key] = results
-     print(f"Successfully processed and cached results for key: {cache_key}")
-     return modal.Response(status_code=200, body=json.dumps(results))

- # Update local entrypoint to use the new main processing function if desired for testing
- # For now, keeping it as is to test transcription independently if needed.

  @app.function(
-     image=whisper_image,
-     secrets=[HF_TOKEN_SECRET],
-     timeout=700, # Increased timeout slightly for model loading and inference
-     gpu="any" # Requires GPU
  )
- def generate_action_labels(video_bytes: bytes) -> dict:
-     import torch
-     import av
-     import numpy as np
-     import tempfile
-     import os
-     from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
-     from huggingface_hub import login

-     if not video_bytes:
-         return {"actions": [], "error": "No video data received."}

-     hf_token = os.environ.get("HF_TOKEN")
-     if hf_token:
-         try:
-             login(token=hf_token)
-             print("Action Recognition: Successfully logged into Hugging Face Hub.")
-         except Exception as e:
-             print(f"Action Recognition: Hugging Face Hub login failed: {e}.")
-     else:
-         print("Action Recognition: HF_TOKEN secret not found. Proceeding without login.")

-     video_path = None
      try:
-         device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"Action Recognition: Loading model on device: {device}")

-         processor = VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
-         model = VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
-         model.to(device)
-         model.eval()
-         print(f"Action Recognition: Model {ACTION_MODEL_NAME} and processor loaded.")

-         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
-             tmp_video_file.write(video_bytes)
-             video_path = tmp_video_file.name
-
-         container = av.open(video_path)
-         stream = container.streams.video[0]
-
-         num_frames_to_extract = 16
-         total_frames = stream.frames
-         if total_frames == 0:
-             return {"actions": [], "error": "Video stream has no frames."}
-
-         # Ensure we don't try to select more frames than available, especially for very short videos
-         if total_frames < num_frames_to_extract:
-             print(f"Warning: Video has only {total_frames} frames, less than desired {num_frames_to_extract}. Using all available frames.")
-             num_frames_to_extract = total_frames
-             if num_frames_to_extract == 0: # Double check after adjustment
-                 return {"actions": [], "error": "Video stream has no frames after adjustment."}
-
-         indices = np.linspace(0, total_frames - 1, num_frames_to_extract, dtype=int)
-
-         frames = []
-         container.seek(0) # Reset stream to the beginning before decoding specific frames
-         frame_idx_counter = 0
-         target_idx_ptr = 0
-         for frame in container.decode(video=0):
-             if target_idx_ptr < len(indices) and frame_idx_counter == indices[target_idx_ptr]:
-                 frames.append(frame.to_image()) # Convert to PIL Image
-                 target_idx_ptr += 1
-             frame_idx_counter += 1
-             if target_idx_ptr == len(indices):
-                 break

-         container.close()

-         if not frames:
-             return {"actions": [], "error": "Could not extract frames from video."}

-         print(f"Action Recognition: Extracted {len(frames)} frames.")

-         # Process frames and predict
-         inputs = processor(frames, return_tensors="pt").to(device)

-         with torch.no_grad():
-             outputs = model(**inputs)
-             logits = outputs.logits

-         predicted_class_idx = logits.argmax(-1).item()
-         predicted_label = model.config.id2label[predicted_class_idx]

-         print(f"Action Recognition: Predicted action: {predicted_label}")
-         return {"actions": [predicted_label], "error": None}

-     except Exception as e:
-         print(f"Error during action recognition: {e}")
-         import traceback
-         traceback.print_exc()
-         return {"actions": [], "error": f"Action recognition failed: {str(e)}"}
-     finally:
-         if video_path and os.path.exists(video_path):
-             try:
-                 os.remove(video_path)
-                 print(f"Removed temporary video file for action recognition: {video_path}")
-             except Exception as e_rm:
-                 print(f"Error removing temporary action recognition video file {video_path}: {e_rm}")
  import modal
+ from fastapi import FastAPI, UploadFile, File, Body, Query
  import os
  import tempfile
+ import io # Used by Whisper for BytesIO
+ import hashlib # For generating cache keys
+ import httpx # For downloading video from URL if needed by endpoint
+ import gradio as gr
+ import gradio.routes
+ from typing import Dict, List, Any, Optional # For type hinting results and Optional in Pydantic
+ from fastapi.responses import JSONResponse # For FastAPI endpoint
+ from fastapi import File, Body, UploadFile, Query # For FastAPI file uploads, request body parts, and query parameters
+ from pydantic import BaseModel # For FastAPI request body validation
+ import re # For parsing search results
+ import asyncio # For concurrent video processing
+
+ # --- Constants for Model Names ---
+ WHISPER_MODEL_NAME = "openai/whisper-large-v3"
  CAPTION_MODEL_NAME = "Neleac/SpaceTimeGPT"
+ CAPTION_PROCESSOR_NAME = "MCG-NJU/videomae-base" # For SpaceTimeGPT's video encoder
+ # CAPTION_TOKENIZER_NAME = "gpt2" # For SpaceTimeGPT's text decoder (usually part of processor)
  ACTION_MODEL_NAME = "MCG-NJU/videomae-base-finetuned-kinetics"
+ ACTION_PROCESSOR_NAME = "MCG-NJU/videomae-base" # Or VideoMAEImageProcessor.from_pretrained(ACTION_MODEL_NAME)
+ OBJECT_DETECTION_MODEL_NAME = "facebook/detr-resnet-50"
+ OBJECT_DETECTION_PROCESSOR_NAME = "facebook/detr-resnet-50"

+ # --- Modal Image Definition ---
+ video_analysis_image = (
      modal.Image.debian_slim(python_version="3.10")
      .apt_install("ffmpeg")
      .pip_install(
+         "gradio==3.50.2", # Pin Gradio version for stability
+         "transformers[torch]", # For all Hugging Face models and PyTorch
+         "soundfile", # For Whisper
+         "av", # For video frame extraction
+         "Pillow", # For image processing
+         "timm", # Often a dependency for vision models
+         "torchvision",
+         "torchaudio",
+         "fastapi[standard]", # For web endpoints
+         "pydantic", # For request body validation
+         "httpx" # For downloading video from URL
      )
  )

+ # --- Modal App Definition ---
+ app = modal.App(name="video-analysis-gradio-pipeline") # New app name, using App

+ fastapi_app = FastAPI() # Initialize FastAPI app

+ # --- Modal Distributed Dictionary for Caching ---
+ video_analysis_cache = modal.Dict.from_name("video_analysis_cache", create_if_missing=True)

+ # --- Hugging Face Token Secret ---
  HF_TOKEN_SECRET = modal.Secret.from_name("my-huggingface-secret")

+ # --- Helper: Hugging Face Login ---
+ def _login_to_hf():
+     import os
      from huggingface_hub import login
+     hf_token = os.environ.get("HF_TOKEN")
      if hf_token:
          try:
              login(token=hf_token)
              print("Successfully logged into Hugging Face Hub.")
+             return True
          except Exception as e:
+             print(f"Hugging Face Hub login failed: {e}")
+             return False
      else:
+         print("HF_TOKEN secret not found. Some models might fail to load.")
+         return False

+ # === 1. Transcription with Whisper ===
+ @app.function(
+     image=video_analysis_image,
+     secrets=[HF_TOKEN_SECRET],
+     gpu="any",
+     timeout=600
+ )
+ def transcribe_video_with_whisper(video_bytes: bytes) -> str:
+     _login_to_hf()
+     import torch
+     from transformers import pipeline
+     import soundfile as sf
+     import av # For robust audio extraction
+     import numpy as np
+     import io

+     print("[Whisper] Starting transcription.")
+     temp_audio_path = None
      try:
+         # Robust audio extraction using PyAV
          with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
              tmp_video_file.write(video_bytes)
              video_path = tmp_video_file.name

+         container = av.open(video_path)
+         audio_stream = next((s for s in container.streams if s.type == 'audio'), None)
+         if audio_stream is None:
+             return "Whisper Error: No audio stream found in video."
+
+         # Decode and resample audio to 16kHz mono WAV
+         # Store resampled audio in a temporary WAV file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file_for_sf:
+             temp_audio_path = tmp_audio_file_for_sf.name

+         output_container = av.open(temp_audio_path, mode='w')
+         output_stream = output_container.add_stream('pcm_s16le', rate=16000, layout='mono')

+         for frame in container.decode(audio_stream):
+             for packet in output_stream.encode(frame):
+                 output_container.mux(packet)
+
+         # Flush stream
+         for packet in output_stream.encode():
+             output_container.mux(packet)
+
+         output_container.close()
+         container.close()
+         os.remove(video_path) # Clean up temp video file
+
+         pipe = pipeline(
+             "automatic-speech-recognition",
+             model=WHISPER_MODEL_NAME,
+             torch_dtype=torch.float16,
+             device="cuda:0" if torch.cuda.is_available() else "cpu",
+         )
+         print(f"[Whisper] Pipeline loaded. Transcribing {temp_audio_path}...")
+         outputs = pipe(temp_audio_path, chunk_length_s=30, batch_size=8, return_timestamps=False)
+         transcription = outputs["text"]
+         print(f"[Whisper] Transcription successful: {transcription[:100]}...")
+         return transcription
      except Exception as e:
+         print(f"[Whisper] Error: {e}")
          import traceback
+         traceback.print_exc()
+         return f"Whisper Error: {str(e)}"
      finally:
+         if temp_audio_path and os.path.exists(temp_audio_path):
+             os.remove(temp_audio_path)
+         if 'video_path' in locals() and video_path and os.path.exists(video_path):
+             os.remove(video_path) # Ensure temp video is cleaned up if audio extraction failed early

+ # === 2. Captioning with SpaceTimeGPT ===
  @app.function(
+     image=video_analysis_image,
      secrets=[HF_TOKEN_SECRET],
+     gpu="any",
+     timeout=600
  )
+ def generate_captions_with_spacetimegpt(video_bytes: bytes) -> str:
+     _login_to_hf()
      import torch
+     from transformers import AutoProcessor, AutoModelForCausalLM
+     import av
      import numpy as np
+     import tempfile

+     print("[SpaceTimeGPT] Starting captioning.")
      video_path = None
      try:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+             tmp_video_file.write(video_bytes)
+             video_path = tmp_video_file.name
+
+         container = av.open(video_path)
+         video_stream = next((s for s in container.streams if s.type == 'video'), None)
+         if video_stream is None:
+             return "SpaceTimeGPT Error: No video stream found."

+         num_frames_to_sample = 16
+         total_frames = video_stream.frames
+         if total_frames == 0: return "SpaceTimeGPT Error: Video has no frames."
+
+         indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
+         frames = []
+         for i in indices:
+             container.seek(i, stream=video_stream)
+             frame = next(container.decode(video_stream))
+             frames.append(frame.to_rgb().to_ndarray())
+         container.close()
+         video_frames_np = np.stack(frames)
+
+         processor = AutoProcessor.from_pretrained(CAPTION_PROCESSOR_NAME, trust_remote_code=True)
+         model = AutoModelForCausalLM.from_pretrained(CAPTION_MODEL_NAME, trust_remote_code=True)
          device = "cuda:0" if torch.cuda.is_available() else "cpu"
          model.to(device)
+         if hasattr(processor, 'tokenizer'): # Check if tokenizer exists
+             processor.tokenizer.padding_side = "right"
+
+         print("[SpaceTimeGPT] Model and processor loaded. Generating captions...")
+         inputs = processor(text=None, videos=list(video_frames_np), return_tensors="pt", padding=True).to(device)
+
+         generated_ids = model.generate(**inputs, max_new_tokens=128)
+         captions = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         print(f"[SpaceTimeGPT] Captioning successful: {captions}")
+         return captions
+     except Exception as e:
+         print(f"[SpaceTimeGPT] Error: {e}")
+         import traceback
+         traceback.print_exc()
+         return f"SpaceTimeGPT Error: {str(e)}"
+     finally:
+         if video_path and os.path.exists(video_path):
+             os.remove(video_path)
+
+ # === 3. Action Recognition with VideoMAE ===
+ @app.function(
+     image=video_analysis_image,
+     secrets=[HF_TOKEN_SECRET],
+     gpu="any",
+     timeout=600
+ )
+ def generate_action_labels(video_bytes: bytes) -> List[Dict[str, Any]]:
+     _login_to_hf()
+     import torch
+     from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+     import av
+     import numpy as np
+     import tempfile

+     print("[VideoMAE] Starting action recognition.")
+     video_path = None
+     try:
          with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
              tmp_video_file.write(video_bytes)
              video_path = tmp_video_file.name

          container = av.open(video_path)
+         video_stream = next((s for s in container.streams if s.type == 'video'), None)
+         if video_stream is None:
+             return [{"error": "VideoMAE Error: No video stream found."}]
+
+         num_frames_to_sample = 16
+         total_frames = video_stream.frames
+         if total_frames == 0: return [{"error": "VideoMAE Error: Video has no frames."}]

+         indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
+         video_frames_list = []
+         for i in indices:
+             container.seek(i, stream=video_stream)
+             frame = next(container.decode(video_stream))
+             video_frames_list.append(frame.to_rgb().to_ndarray())
          container.close()
+
+         processor = VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
+         model = VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
+         device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         model.to(device)
+
+         print("[VideoMAE] Model and processor loaded. Classifying actions...")
+         inputs = processor(video_frames_list, return_tensors="pt").to(device)

+         with torch.no_grad():
+             outputs = model(**inputs)
+             logits = outputs.logits
+
+         top_k = 5
+         probabilities = torch.softmax(logits, dim=-1)
+         top_probs, top_indices = torch.topk(probabilities, top_k)
+
+         results = []
+         for i in range(top_k):
+             label = model.config.id2label[top_indices[0, i].item()]
+             score = top_probs[0, i].item()
+             results.append({"action": label, "confidence": round(score, 4)})
+
+         print(f"[VideoMAE] Action recognition successful: {results}")
+         return results
+     except Exception as e:
+         print(f"[VideoMAE] Error: {e}")
+         import traceback
+         traceback.print_exc()
+         return [{"error": f"VideoMAE Error: {str(e)}"}]
+     finally:
+         if video_path and os.path.exists(video_path):
+             os.remove(video_path)
+
+
+ # === 4. Object Detection with DETR ===
+ @app.function(
+     image=video_analysis_image,
+     secrets=[HF_TOKEN_SECRET],
+     gpu="any",
+     timeout=600
+ )
+ def generate_object_detection(video_bytes: bytes) -> List[Dict[str, Any]]:
+     _login_to_hf()
+     import torch
+     from transformers import DetrImageProcessor, DetrForObjectDetection
+     from PIL import Image # Imported but not directly used, av.frame.to_image() is used
+     import av
+     import numpy as np
+     import tempfile
+
+     print("[DETR] Starting object detection.")
+     video_path = None
+     try:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+             tmp_video_file.write(video_bytes)
+             video_path = tmp_video_file.name
+
+         container = av.open(video_path)
+         video_stream = next((s for s in container.streams if s.type == 'video'), None)
+         if video_stream is None:
+             return [{"error": "DETR Error: No video stream found."}]
+
+         num_frames_to_extract = 3
+         total_frames = video_stream.frames
+         if total_frames == 0: return [{"error": "DETR Error: Video has no frames."}]
+
+         frame_indices = np.linspace(0, total_frames - 1, num_frames_to_extract, dtype=int)

+         processor = DetrImageProcessor.from_pretrained(OBJECT_DETECTION_PROCESSOR_NAME)
+         model = DetrForObjectDetection.from_pretrained(OBJECT_DETECTION_MODEL_NAME)
+         device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         model.to(device)
+         print("[DETR] Model and processor loaded.")
+
+         all_frame_detections = []
+         for frame_num, target_frame_index in enumerate(frame_indices):
+             container.seek(target_frame_index, stream=video_stream)
+             frame = next(container.decode(video_stream))
+             pil_image = frame.to_image()

+             print(f"[DETR] Processing frame {frame_num + 1}/{num_frames_to_extract} (original index {target_frame_index})...")
+             inputs = processor(images=pil_image, return_tensors="pt").to(device)
+             outputs = model(**inputs)
+
+             target_sizes = torch.tensor([pil_image.size[::-1]], device=device)
+             results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.7)[0]
+
+             frame_detections = []
+             for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+                 frame_detections.append({
+                     "label": model.config.id2label[label.item()],
+                     "confidence": round(score.item(), 3),
+                     "box": [round(coord) for coord in box.tolist()]
+                 })
+             if frame_detections: # Only add if detections are present for this frame
+                 all_frame_detections.append({
+                     "frame_number": frame_num + 1,
+                     "original_frame_index": int(target_frame_index),
+                     "detections": frame_detections
+                 })
+         container.close()
+         print(f"[DETR] Object detection successful: {all_frame_detections if all_frame_detections else 'No objects detected with threshold.'}")
+         return all_frame_detections if all_frame_detections else [{"info": "No objects detected with current threshold."}]
      except Exception as e:
+         print(f"[DETR] Error: {e}")
          import traceback
          traceback.print_exc()
+         return [{"error": f"DETR Error: {str(e)}"}]
      finally:
          if video_path and os.path.exists(video_path):
+             os.remove(video_path)

+
+ # === 5. Comprehensive Video Analysis (Orchestrator) ===
  @app.function(
+     image=video_analysis_image,
      secrets=[HF_TOKEN_SECRET],
+     gpu="any", # Request GPU as some sub-tasks will need it
+     timeout=1800, # Generous timeout for all models
+     # allow_concurrent_inputs=10, # Optional: if you expect many parallel requests
+     # keep_warm=1 # Optional: to keep one instance warm for faster cold starts
  )
+ async def analyze_video_comprehensive(video_bytes: bytes) -> Dict[str, Any]:
+     print("[Orchestrator] Starting comprehensive video analysis.")
+     cache_key = hashlib.sha256(video_bytes).hexdigest()

+     try:
+         cached_result = await video_analysis_cache.get(cache_key)
+         if cached_result:
+             print(f"[Orchestrator] Cache hit for key: {cache_key}")
+             return cached_result
+     except Exception as e:
+         # Log error but proceed with analysis if cache get fails
+         print(f"[Orchestrator] Cache GET error: {e}. Proceeding with fresh analysis.")

+     print(f"[Orchestrator] Cache miss for key: {cache_key}. Performing full analysis.")
      results = {}

+     print("[Orchestrator] Calling transcription...")
+     try:
+         # .call() is synchronous in the context of the Modal function execution
+         results["transcription"] = transcribe_video_with_whisper.call(video_bytes)
+     except Exception as e:
+         print(f"[Orchestrator] Error in transcription: {e}")
+         results["transcription"] = f"Transcription Error: {str(e)}"

+     print("[Orchestrator] Calling captioning...")
      try:
+         results["caption"] = generate_captions_with_spacetimegpt.call(video_bytes)
      except Exception as e:
+         print(f"[Orchestrator] Error in captioning: {e}")
+         results["caption"] = f"Captioning Error: {str(e)}"

+     print("[Orchestrator] Calling action recognition...")
      try:
+         results["actions"] = generate_action_labels.call(video_bytes)
      except Exception as e:
+         print(f"[Orchestrator] Error in action recognition: {e}")
+         results["actions"] = [{"error": f"Action Recognition Error: {str(e)}"}] # Ensure list type for error

+     print("[Orchestrator] Calling object detection...")
      try:
+         results["objects"] = generate_object_detection.call(video_bytes)
      except Exception as e:
+         print(f"[Orchestrator] Error in object detection: {e}")
+         results["objects"] = [{"error": f"Object Detection Error: {str(e)}"}] # Ensure list type for error
+
+     print("[Orchestrator] All analyses attempted. Storing results in cache.")
+     try:
+         await video_analysis_cache.put(cache_key, results)
+         print(f"[Orchestrator] Successfully cached results for key: {cache_key}")
+     except Exception as e:
+         print(f"[Orchestrator] Cache PUT error: {e}")
+
+     return results
+
+
+ # --- Pydantic model for FastAPI request ---
+ class VideoAnalysisRequestPayload(BaseModel):
+     video_url: Optional[str] = None
+
+
+ # === FastAPI Endpoint for Comprehensive Analysis ===
+ @fastapi_app.post("/analyze_video")
+ async def process_video_for_analysis(
+     payload: Optional[VideoAnalysisRequestPayload] = Body(None),
+     video_file: Optional[UploadFile] = File(None) # Use UploadFile for type hint and async read
+ ):
+     print("[FastAPI Endpoint] Received request for comprehensive analysis.")
+     video_bytes_content: Optional[bytes] = None
+     video_source_description: str = "Unknown"
+
+     if video_file:
+         print(f"[FastAPI Endpoint] Processing uploaded video file: {video_file.filename}, size: {video_file.size} bytes.")
+         video_bytes_content = await video_file.read() # Use await for async read
+         video_source_description = f"direct file upload: {video_file.filename}"
+     elif payload and payload.video_url:
+         video_url = str(payload.video_url) # Ensure it's a string
+         print(f"[FastAPI Endpoint] Processing video_url: {video_url}")
+         video_source_description = f"URL: {video_url}"
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(video_url, follow_redirects=True, timeout=60.0)
+                 response.raise_for_status()
+                 video_bytes_content = await response.aread()
+             if not video_bytes_content:
+                 print(f"[FastAPI Endpoint] Download failed: content was empty for URL: {video_url}")
+                 return JSONResponse(status_code=400, content={"error": f"Failed to download video from URL: {video_url}. Content was empty."})
+             print(f"[FastAPI Endpoint] Successfully downloaded {len(video_bytes_content)} bytes from {video_url}")
+         except httpx.RequestError as e:
+             print(f"[FastAPI Endpoint] httpx.RequestError downloading video: {e}")
+             return JSONResponse(status_code=400, content={"error": f"Error downloading video from URL: {video_url}. Details: {str(e)}"})
+         except Exception as e:
+             print(f"[FastAPI Endpoint] Unexpected Exception downloading video: {e}")
+             return JSONResponse(status_code=500, content={"error": f"Unexpected error downloading video. Details: {str(e)}"})
+     else:
+         print("[FastAPI Endpoint] No video_url in payload and no video_file uploaded.")
+         return JSONResponse(status_code=400, content={"error": "Either 'video_url' in JSON payload or a 'video_file' in form-data must be provided."})

+     if not video_bytes_content:
+         print("[FastAPI Endpoint] Critical error: video_bytes_content is not populated after input processing.")
+         return JSONResponse(status_code=500, content={"error": "Internal server error: video data could not be obtained."})
+
+     print(f"[FastAPI Endpoint] Calling analyze_video_comprehensive for video from {video_source_description} ({len(video_bytes_content)} bytes).")
+     try:
+         # Since process_video_for_analysis is an @app.function, it can .call() another @app.function
+         analysis_results = await analyze_video_comprehensive.call(video_bytes_content)
+         print("[FastAPI Endpoint] Comprehensive analysis finished.")
+         return JSONResponse(status_code=200, content=analysis_results)
+     except modal.exception.ModalError as e:
+         print(f"[FastAPI Endpoint] ModalError during comprehensive analysis: {e}")
+         return JSONResponse(status_code=500, content={"error": f"Modal processing error: {str(e)}"})
+     except Exception as e:
+         print(f"[FastAPI Endpoint] Unexpected Exception during comprehensive analysis: {e}")
+         # import traceback # Uncomment for detailed server-side stack trace
+         # traceback.print_exc() # Uncomment for detailed server-side stack trace
+         return JSONResponse(status_code=500, content={"error": f"Unexpected server error during analysis: {str(e)}"})

+
+ @fastapi_app.post("/analyze_topic")
+ async def analyze_topic_endpoint(topic: str = Query(..., min_length=3, description="The topic to search videos for."),
+                                  max_videos: Optional[int] = Query(3, ge=1, le=10, description="Maximum number of videos to find and analyze.")):
+     """Endpoint to find videos for a topic, analyze them, and return aggregated results."""
+     print(f"[FastAPI /analyze_topic] Received request for topic: '{topic}', max_videos: {max_videos}")
+
+     # This endpoint is orchestrated by Cascade: Cascade calls its `search_web` tool with the
+     # topic, passes the results to the local helper `extract_video_urls_from_search`, and then
+     # invokes the Modal function `analyze_videos_by_topic.remote()` with the extracted URLs.
+     # The body below is therefore only a declaration of the route; the real work happens in
+     # Cascade's execution flow, not in this FastAPI handler.
+
+     # For standalone Modal testing, one might simulate the orchestration:
+     # if modal.is_local():
+     #     simulated_search_results = [{"link": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}]
+     #     video_urls = extract_video_urls_from_search(simulated_search_results, max_videos)
+     #     if not video_urls:
+     #         return JSONResponse(status_code=404, content={"error": "No relevant video URLs found for the topic after search."})
+     #     try:
+     #         result = await analyze_videos_by_topic.coro(video_urls=video_urls, topic=topic)
+     #         return result
+     #     except Exception as e:
+     #         return JSONResponse(status_code=500, content={"error": f"Error during topic analysis: {str(e)}"})

+     # Return an acknowledgement; this response is not normally seen when Cascade calls
+     # analyze_videos_by_topic.remote() directly.
+     return JSONResponse(status_code=202, content={"message": "Topic analysis process initiated. Cascade will orchestrate the search and analysis."})
+

+ # === 6. Topic-Based Video Search ===
  @app.function(
+     image=video_analysis_image,
+     secrets=[HF_TOKEN_SECRET],
+     timeout=300
  )
+ def find_video_urls_for_topic(topic: str, max_results: int = 3) -> List[str]:
+     """Finds video URLs (YouTube, direct links) for a given topic using web search."""
+     print(f"[TopicSearch] Finding video URLs for topic: '{topic}', max_results={max_results}")
+
+     # NOTE: This function is a Modal-compatible stub. The actual web search is performed by
+     # Cascade's `search_web` tool when Cascade orchestrates a call to this function via
+     # `.remote()`; the Modal runtime itself has no search capability here.
+     # If this app were run standalone (without Cascade), this body would need a concrete
+     # implementation using an HTTP client plus an HTML parser, or a dedicated search API
+     # (e.g., SerpApi or the Google Search API), for example with a query like:
+     # query = f"{topic} video youtube OR .mp4 OR .mov"
+     # search_results = []  # would be populated by the search call
+
+     # To keep the Modal app structure well defined, the stub simply logs the call and returns
+     # an empty list; Cascade supplies the real URLs through its own tool execution.
+     print(f"[TopicSearch] Function '{find_video_urls_for_topic.__name__}' called. Expecting Cascade to perform web search.")
+     return [] # Placeholder: Cascade will provide actual URLs via its search_web tool.
+
+ # Helper function (not a Modal function) to extract video URLs from search results
+ def extract_video_urls_from_search(search_results: List[Dict[str, str]], max_urls: int = 3) -> List[str]:
+     """Extracts video URLs from a list of search result dictionaries."""
+     video_urls = []
+     seen_urls = set()
+
+     # Regex for YouTube, Vimeo, and common video file extensions
+     # Simplified YouTube regex to catch most common video and shorts links
+     youtube_regex = r"(?:https?://)?(?:www\.)?(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)([a-zA-Z0-9_-]{11})"
+     vimeo_regex = r"(?:https?://)?(?:www\.)?vimeo\.com/(\d+)"
+     direct_video_regex = r"https?://[^\s]+\.(mp4|mov|avi|webm|mkv)(\?[^\s]*)?"
+
+     patterns = [
+         re.compile(youtube_regex),
+         re.compile(vimeo_regex),
+         re.compile(direct_video_regex)
+     ]
+
+     for item in search_results:
+         url = item.get("link") or item.get("url") # Common keys for URL in search results
+         if not url:
+             continue
+
+         for pattern in patterns:
+             match = pattern.search(url)
+             if match:
+                 # Reconstruct canonical YouTube URL if it's a short link or embed
+                 if pattern.pattern == youtube_regex and match.group(1):
+                     normalized_url = f"https://www.youtube.com/watch?v={match.group(1)}"
+                 else:
+                     normalized_url = url
+
+                 if normalized_url not in seen_urls:
+                     video_urls.append(normalized_url)
+                     seen_urls.add(normalized_url)
+                 if len(video_urls) >= max_urls:
+                     break
+         if len(video_urls) >= max_urls:
+             break
+
+     print(f"[URL Extraction] Extracted {len(video_urls)} video URLs: {video_urls}")
+     return video_urls


+ # === 7. Topic-Based Video Analysis Orchestrator ===
+ @app.function(
+     image=video_analysis_image,
+     secrets=[HF_TOKEN_SECRET],
+     gpu="any", # Child functions use GPU
+     timeout=3600 # Allow up to 1 hour for multiple video analyses
+ )
+ async def _download_and_analyze_one_video(client: httpx.AsyncClient, video_url: str, topic: str) -> Dict[str, Any]:
+     """Helper to download and analyze a single video. Returns result or error dict."""
+     print(f"[TopicAnalysisWorker] Processing video URL for topic '{topic}': {video_url}")
      try:
+         # 1. Download video
+         print(f"[TopicAnalysisWorker] Downloading video from: {video_url}")
+         response = await client.get(video_url)
+         response.raise_for_status() # Raise HTTPError for bad responses (4XX or 5XX)
+         video_bytes = await response.aread()
+         print(f"[TopicAnalysisWorker] Downloaded {len(video_bytes)} bytes from {video_url}")
+
+         if not video_bytes:
+             raise ValueError("Downloaded video content is empty.")
+
+         # 2. Analyze video
+         analysis_result = await analyze_video_comprehensive.coro(video_bytes)

+         # Check if the analysis itself returned an error structure
+         if isinstance(analysis_result, dict) and any(key + "_error" in analysis_result for key in ["transcription", "caption", "actions", "objects"]):
+             print(f"[TopicAnalysisWorker] Comprehensive analysis for {video_url} reported errors: {analysis_result}")
+             return {"url": video_url, "error_type": "analysis_error", "error_details": analysis_result}
+         else:
+             return {"url": video_url, "analysis": analysis_result}
+
+     except httpx.HTTPStatusError as e:
+         print(f"[TopicAnalysisWorker] HTTP error downloading {video_url}: {e}")
+         return {"url": video_url, "error_type": "download_error", "error_details": f"HTTP {e.response.status_code}: {e.response.text[:200]}"}
+     except httpx.RequestError as e:
+         print(f"[TopicAnalysisWorker] Request error downloading {video_url}: {e}")
+         return {"url": video_url, "error_type": "download_error", "error_details": f"Failed to download: {str(e)}"}
+     except Exception as e:
+         print(f"[TopicAnalysisWorker] Error processing video {video_url}: {e}")
+         import traceback
+         # Consider logging traceback.format_exc() instead of printing if running in a less verbose environment
+         # traceback.print_exc() # This might be too verbose for regular Modal logs
+         return {"url": video_url, "error_type": "processing_error", "error_details": str(e), "traceback": traceback.format_exc()[:1000]}

+ async def analyze_videos_by_topic(video_urls: List[str], topic: str) -> Dict[str, Any]:
+     """Analyzes a list of videos (by URL) concurrently and aggregates results for a topic."""
+     print(f"[TopicAnalysis] Starting concurrent analysis for topic: '{topic}' with {len(video_urls)} video(s).")
+
+     results_aggregator = {
+         "topic": topic,
+         "analyzed_videos": [],
+         "errors": []
+     }
+
+     if not video_urls:
+         results_aggregator["errors"].append({"topic_error": "No video URLs provided or found for the topic."})
+         return results_aggregator
+
+     async with httpx.AsyncClient(timeout=300.0) as client: # 5 min timeout for individual downloads
+         tasks = [_download_and_analyze_one_video(client, url, topic) for url in video_urls]

+         # return_exceptions=True allows us to get results for successful tasks even if others fail
+         individual_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         for res_or_exc in individual_results:
+             if isinstance(res_or_exc, Exception):
+                 # This handles exceptions not caught within _download_and_analyze_one_video itself (should be rare)
+                 # Or if return_exceptions=True was used and _download_and_analyze_one_video raised an unhandled one.
+                 print(f"[TopicAnalysis] An unexpected exception occurred during asyncio.gather: {res_or_exc}")
+                 results_aggregator["errors"].append({"url": "unknown_url_due_to_gather_exception", "processing_error": str(res_or_exc)})
+             elif isinstance(res_or_exc, dict):
+                 if "error_type" in res_or_exc:
+                     results_aggregator["errors"].append(res_or_exc) # Append the error dict directly
+                 elif "analysis" in res_or_exc:
+                     results_aggregator["analyzed_videos"].append(res_or_exc)
+                 else:
+                     print(f"[TopicAnalysis] Received an unexpected dictionary structure: {res_or_exc}")
+                     results_aggregator["errors"].append({"url": res_or_exc.get("url", "unknown"), "processing_error": "Unknown result structure"})
+             else:
+                 print(f"[TopicAnalysis] Received an unexpected result type from asyncio.gather: {type(res_or_exc)}")
+                 results_aggregator["errors"].append({"url": "unknown", "processing_error": f"Unexpected result type: {type(res_or_exc)}"})

+     print(f"[TopicAnalysis] Finished concurrent analysis for topic '{topic}'.")
+     return results_aggregator

+ # === Gradio Interface ===
+ def video_analyzer_gradio_ui():
+     print("[Gradio] UI function called to define interface.")
+
+     def analyze_video_all_models(video_filepath):
+         print(f"[Gradio] Received video filepath for analysis: {video_filepath}")

+         if not video_filepath or not os.path.exists(video_filepath):
+             return "Error: Video file path is invalid or does not exist.", "", "[]", "[]"

+         with open(video_filepath, "rb") as f:
+             video_bytes_content = f.read()
+         print(f"[Gradio] Read {len(video_bytes_content)} bytes from video path: {video_filepath}")
+
+         if not video_bytes_content:
+             return "Error: Could not read video bytes.", "", "[]", "[]"
+
+         print("[Gradio] Calling Whisper...")
+         transcription = transcribe_video_with_whisper.call(video_bytes_content)
+         print(f"[Gradio] Whisper result length: {len(transcription)}")
+
+         print("[Gradio] Calling SpaceTimeGPT...")
+         captions = generate_captions_with_spacetimegpt.call(video_bytes_content)
+         print(f"[Gradio] SpaceTimeGPT result: {captions}")

+         print("[Gradio] Calling VideoMAE...")
+         action_labels = generate_action_labels.call(video_bytes_content)
+         print(f"[Gradio] VideoMAE result: {action_labels}")

+         print("[Gradio] Calling DETR...")
+         object_detections = generate_object_detection.call(video_bytes_content)
+         print(f"[Gradio] DETR result: {object_detections}")
+
+         return transcription, captions, str(action_labels), str(object_detections)

+     with gr.Blocks(title="Comprehensive Video Analyzer", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# Comprehensive Video Analyzer")
+         gr.Markdown("Upload a video to get transcription, captions, action labels, and object detections.")
+
+         with gr.Row():
+             video_input = gr.Video(label="Upload Video", sources=["upload"], type="filepath")
+
+         submit_button = gr.Button("Analyze Video", variant="primary")
+
+         with gr.Tabs():
+             with gr.TabItem("Transcription (Whisper)"):
+                 transcription_output = gr.Textbox(label="Transcription", lines=10, interactive=False)
+             with gr.TabItem("Dense Captions (SpaceTimeGPT)"):
+                 caption_output = gr.Textbox(label="Captions", lines=10, interactive=False)
+             with gr.TabItem("Action Recognition (VideoMAE)"):
+                 action_output = gr.Textbox(label="Predicted Actions (JSON format)", lines=10, interactive=False)
+             with gr.TabItem("Object Detection (DETR)"):
+                 object_output = gr.Textbox(label="Detected Objects (JSON format)", lines=10, interactive=False)
+
+         submit_button.click(
+             fn=analyze_video_all_models,
+             inputs=[video_input],
+             outputs=[transcription_output, caption_output, action_output, object_output]
+         )
+
+         gr.Markdown("### Example Video")
+         gr.Markdown("You can test with a short video. Processing may take a few minutes depending on video length and model inference times.")
+
+     print("[Gradio] UI definition complete.")
+     return gr.routes.App.create_app(demo)
+
+
+ # === Main ASGI App (FastAPI + Gradio) ===
+ @modal.asgi_app()
+ def main_asgi():
+     # fastapi_app is defined globally
+     # video_analyzer_gradio_ui returns an ASGI-compatible Gradio app
+     gradio_asgi_app = video_analyzer_gradio_ui()
+     fastapi_app.mount("/gradio", gradio_asgi_app, name="gradio_ui")
+     print("FastAPI app with Gradio UI mounted on /gradio is ready.")
+     return fastapi_app
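
For reference, a minimal client-side sketch (not part of this commit) of how the new /analyze_video route could be exercised once the app is deployed. The base URL is a placeholder for whatever `modal deploy` prints, "https://example.com/clip.mp4" is a stand-in URL, and `sample.mp4` stands in for any local test clip; only `requests` (already listed in requirements.txt) is assumed on the client side.

    # Hedged sketch: deployment URL and file names below are hypothetical.
    import requests

    BASE_URL = "https://<your-workspace>--video-analysis-gradio-pipeline.modal.run"  # placeholder

    # Option A: JSON body with a video_url; the backend downloads the file itself.
    resp = requests.post(
        f"{BASE_URL}/analyze_video",
        json={"video_url": "https://example.com/clip.mp4"},  # placeholder URL
        timeout=1800,
    )
    print(resp.status_code, resp.json())

    # Option B: multipart upload; the field name must be 'video_file' to match the endpoint.
    with open("sample.mp4", "rb") as f:
        resp = requests.post(f"{BASE_URL}/analyze_video", files={"video_file": f}, timeout=1800)
    print(resp.status_code, resp.json())
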
requirements.txt CHANGED
@@ -1,5 +1,3 @@
  gradio
- moviepy
  requests
  yt-dlp
- modal

  gradio
  requests
  yt-dlp
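
The dependency cleanup appears to track the backend rewrite: `moviepy` is dropped, presumably because audio extraction now goes through PyAV inside the Modal image, and `modal` is dropped from the Space's runtime requirements, presumably because the backend is deployed separately. A rough sketch, under those assumptions, of how the remaining client-side packages (`yt-dlp` and `requests`) could feed a video into the endpoint above; the function names, output path, and endpoint URL are illustrative only.

    # Hedged sketch, assuming a front-end that fetches videos locally with yt-dlp.
    import requests
    import yt_dlp

    ANALYZE_URL = "https://<your-workspace>--video-analysis-gradio-pipeline.modal.run/analyze_video"  # placeholder

    def fetch_video(url: str, out_path: str = "downloaded_video.mp4") -> str:
        # yt-dlp writes the selected format to out_path.
        with yt_dlp.YoutubeDL({"outtmpl": out_path, "format": "mp4/best"}) as ydl:
            ydl.download([url])
        return out_path

    def analyze(path: str) -> dict:
        # Upload as multipart form-data under the 'video_file' field expected by the endpoint.
        with open(path, "rb") as f:
            resp = requests.post(ANALYZE_URL, files={"video_file": f}, timeout=1800)
        resp.raise_for_status()
        return resp.json()

    if __name__ == "__main__":
        local_path = fetch_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        print(analyze(local_path))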