vidhi0405 committed on
Commit
c3d87b5
·
1 Parent(s): b98c114
Files changed (3) hide show
  1. app.py +11 -1
  2. huggingface_exact_approach.py +110 -56
  3. src/smolvlm2_handler.py +13 -6
app.py CHANGED
@@ -25,7 +25,6 @@ CACHE_DIR = os.path.join("/tmp", ".cache", "huggingface")
25
  os.makedirs(CACHE_DIR, exist_ok=True)
26
  os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
27
  os.environ["HF_HOME"] = CACHE_DIR
28
- os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
29
  os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
30
  os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
31
  os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
@@ -178,6 +177,17 @@ async def health_check():
178
  }
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
181
  @app.get("/ready")
182
  async def readiness_check():
183
  loaded = detector_registry.loaded_models()
 
25
  os.makedirs(CACHE_DIR, exist_ok=True)
26
  os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
27
  os.environ["HF_HOME"] = CACHE_DIR
 
28
  os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
29
  os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
30
  os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
 
177
  }
178
 
179
 
180
+ @app.get("/")
181
+ async def root():
182
+ return {
183
+ "service": "SmolVLM2 Video Highlights API",
184
+ "status": "ok",
185
+ "health": "/health",
186
+ "ready": "/ready",
187
+ "upload": "/upload-video",
188
+ }
189
+
190
+
191
  @app.get("/ready")
192
  async def readiness_check():
193
  loaded = detector_registry.loaded_models()
huggingface_exact_approach.py CHANGED
@@ -67,11 +67,19 @@ class VideoHighlightDetector:
67
 
68
  # Initialize model and processor
69
  self.processor = AutoProcessor.from_pretrained(model_path)
70
- self.model = AutoModelForImageTextToText.from_pretrained(
71
- model_path,
72
- torch_dtype=self.dtype,
73
- # _attn_implementation="flash_attention_2"
74
- ).to(device)
 
 
 
 
 
 
 
 
75
 
76
  # Store model path for reference
77
  self.model_path = model_path
@@ -424,9 +432,26 @@ class VideoHighlightDetector:
424
  self._concatenate_with_effects(video_path, scene_times, output_path)
425
  else:
426
  self._concatenate_basic(video_path, scene_times, output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
  def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
429
  """Basic concatenation without effects."""
 
430
  filter_complex_parts = []
431
  concat_inputs = []
432
  for i, (start_sec, end_sec) in enumerate(scene_times):
@@ -434,48 +459,62 @@ class VideoHighlightDetector:
434
  f"[0:v]trim=start={start_sec}:end={end_sec},"
435
  f"setpts=PTS-STARTPTS[v{i}];"
436
  )
437
- filter_complex_parts.append(
438
- f"[0:a]atrim=start={start_sec}:end={end_sec},"
439
- f"asetpts=PTS-STARTPTS[a{i}];"
440
- )
441
- concat_inputs.append(f"[v{i}][a{i}]")
442
-
443
- concat_filter = f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=1[outv][outa]"
 
 
 
 
 
 
 
444
  filter_complex = "".join(filter_complex_parts) + concat_filter
445
 
446
- cmd = [
447
- "ffmpeg",
448
- "-y",
449
- "-i", video_path,
450
- "-filter_complex", filter_complex,
451
- "-map", "[outv]",
452
- "-map", "[outa]",
453
- "-c:v", "libx264",
454
- "-c:a", "aac",
455
- output_path
456
- ]
457
 
458
  logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
459
  subprocess.run(cmd, check=True, capture_output=True, text=True)
460
 
461
  def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
462
  """Concatenate with fade effects between segments."""
 
463
  if len(scene_times) == 1:
464
  # Single segment - just extract with fade in/out
465
  start_sec, end_sec = scene_times[0]
466
  duration = end_sec - start_sec
467
  fade_duration = min(0.5, duration / 4) # 0.5s or 25% of duration, whichever is shorter
468
-
469
- cmd = [
470
- "ffmpeg", "-y",
471
- "-i", video_path,
472
- "-ss", str(start_sec),
473
- "-t", str(duration),
474
- "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
475
- "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
476
- "-c:v", "libx264", "-c:a", "aac",
477
- output_path
478
- ]
 
 
 
 
 
 
 
 
 
 
 
 
479
  else:
480
  # Multiple segments - create with crossfade transitions
481
  filter_parts = []
@@ -491,31 +530,46 @@ class VideoHighlightDetector:
491
  f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
492
  )
493
 
494
- # Audio with fade
495
- audio_parts.append(
496
- f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
497
- f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
498
- )
 
499
 
500
  # Concatenate all segments
501
  video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
502
- audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
503
-
504
- filter_complex = (
505
- ";".join(filter_parts) + ";" +
506
- ";".join(audio_parts) + ";" +
507
- f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
508
- f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
509
- )
510
-
511
- cmd = [
512
- "ffmpeg", "-y",
513
- "-i", video_path,
514
- "-filter_complex", filter_complex,
515
- "-map", "[outv]", "-map", "[outa]",
516
- "-c:v", "libx264", "-c:a", "aac",
517
- output_path
518
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
 
520
  logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
521
  result = subprocess.run(cmd, capture_output=True, text=True)
 
67
 
68
  # Initialize model and processor
69
  self.processor = AutoProcessor.from_pretrained(model_path)
70
+ try:
71
+ self.model = AutoModelForImageTextToText.from_pretrained(
72
+ model_path,
73
+ dtype=self.dtype,
74
+ # _attn_implementation="flash_attention_2"
75
+ ).to(device)
76
+ except TypeError:
77
+ # Backward compatibility for older Transformers versions.
78
+ self.model = AutoModelForImageTextToText.from_pretrained(
79
+ model_path,
80
+ torch_dtype=self.dtype,
81
+ # _attn_implementation="flash_attention_2"
82
+ ).to(device)
83
 
84
  # Store model path for reference
85
  self.model_path = model_path
 
432
  self._concatenate_with_effects(video_path, scene_times, output_path)
433
  else:
434
  self._concatenate_basic(video_path, scene_times, output_path)
435
+
436
+ def _video_has_audio(self, video_path: str) -> bool:
437
+ """Return True when the input contains at least one audio stream."""
438
+ cmd = [
439
+ "ffprobe",
440
+ "-v", "error",
441
+ "-select_streams", "a",
442
+ "-show_entries", "stream=index",
443
+ "-of", "csv=p=0",
444
+ video_path,
445
+ ]
446
+ try:
447
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
448
+ return bool(result.stdout.strip())
449
+ except Exception:
450
+ return False
451
 
452
  def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
453
  """Basic concatenation without effects."""
454
+ has_audio = self._video_has_audio(video_path)
455
  filter_complex_parts = []
456
  concat_inputs = []
457
  for i, (start_sec, end_sec) in enumerate(scene_times):
 
459
  f"[0:v]trim=start={start_sec}:end={end_sec},"
460
  f"setpts=PTS-STARTPTS[v{i}];"
461
  )
462
+ if has_audio:
463
+ filter_complex_parts.append(
464
+ f"[0:a]atrim=start={start_sec}:end={end_sec},"
465
+ f"asetpts=PTS-STARTPTS[a{i}];"
466
+ )
467
+ concat_inputs.append(f"[v{i}][a{i}]")
468
+ else:
469
+ concat_inputs.append(f"[v{i}]")
470
+
471
+ concat_filter = (
472
+ f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=1[outv][outa]"
473
+ if has_audio
474
+ else f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=0[outv]"
475
+ )
476
  filter_complex = "".join(filter_complex_parts) + concat_filter
477
 
478
+ cmd = ["ffmpeg", "-y", "-i", video_path, "-filter_complex", filter_complex, "-map", "[outv]"]
479
+ if has_audio:
480
+ cmd += ["-map", "[outa]", "-c:v", "libx264", "-c:a", "aac", output_path]
481
+ else:
482
+ cmd += ["-an", "-c:v", "libx264", output_path]
 
 
 
 
 
 
483
 
484
  logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
485
  subprocess.run(cmd, check=True, capture_output=True, text=True)
486
 
487
  def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
488
  """Concatenate with fade effects between segments."""
489
+ has_audio = self._video_has_audio(video_path)
490
  if len(scene_times) == 1:
491
  # Single segment - just extract with fade in/out
492
  start_sec, end_sec = scene_times[0]
493
  duration = end_sec - start_sec
494
  fade_duration = min(0.5, duration / 4) # 0.5s or 25% of duration, whichever is shorter
495
+
496
+ if has_audio:
497
+ cmd = [
498
+ "ffmpeg", "-y",
499
+ "-i", video_path,
500
+ "-ss", str(start_sec),
501
+ "-t", str(duration),
502
+ "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
503
+ "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
504
+ "-c:v", "libx264", "-c:a", "aac",
505
+ output_path
506
+ ]
507
+ else:
508
+ cmd = [
509
+ "ffmpeg", "-y",
510
+ "-i", video_path,
511
+ "-ss", str(start_sec),
512
+ "-t", str(duration),
513
+ "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
514
+ "-an",
515
+ "-c:v", "libx264",
516
+ output_path
517
+ ]
518
  else:
519
  # Multiple segments - create with crossfade transitions
520
  filter_parts = []
 
530
  f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
531
  )
532
 
533
+ if has_audio:
534
+ # Audio with fade
535
+ audio_parts.append(
536
+ f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
537
+ f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
538
+ )
539
 
540
  # Concatenate all segments
541
  video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
542
+
543
+ if has_audio:
544
+ audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
545
+ filter_complex = (
546
+ ";".join(filter_parts) + ";" +
547
+ ";".join(audio_parts) + ";" +
548
+ f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
549
+ f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
550
+ )
551
+ cmd = [
552
+ "ffmpeg", "-y",
553
+ "-i", video_path,
554
+ "-filter_complex", filter_complex,
555
+ "-map", "[outv]", "-map", "[outa]",
556
+ "-c:v", "libx264", "-c:a", "aac",
557
+ output_path
558
+ ]
559
+ else:
560
+ filter_complex = (
561
+ ";".join(filter_parts) + ";" +
562
+ f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv]"
563
+ )
564
+ cmd = [
565
+ "ffmpeg", "-y",
566
+ "-i", video_path,
567
+ "-filter_complex", filter_complex,
568
+ "-map", "[outv]",
569
+ "-an",
570
+ "-c:v", "libx264",
571
+ output_path
572
+ ]
573
 
574
  logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
575
  result = subprocess.run(cmd, capture_output=True, text=True)
src/smolvlm2_handler.py CHANGED
@@ -13,7 +13,6 @@ if 'HF_HOME' not in os.environ:
13
  os.makedirs(CACHE_DIR, exist_ok=True)
14
  os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
15
  os.environ['HF_HOME'] = CACHE_DIR
16
- os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
17
  os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
18
  os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
19
  os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
@@ -93,11 +92,19 @@ class SmolVLM2Handler:
93
  dtype = self._get_torch_dtype()
94
  logger.info(f"Using torch dtype: {dtype}")
95
 
96
- self.model = AutoModelForImageTextToText.from_pretrained(
97
- self.model_name,
98
- torch_dtype=dtype,
99
- trust_remote_code=True
100
- )
 
 
 
 
 
 
 
 
101
  self.model = self.model.to(self.device)
102
 
103
  logger.info("✅ Model loaded successfully!")
 
13
  os.makedirs(CACHE_DIR, exist_ok=True)
14
  os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
15
  os.environ['HF_HOME'] = CACHE_DIR
 
16
  os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
17
  os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
18
  os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
 
92
  dtype = self._get_torch_dtype()
93
  logger.info(f"Using torch dtype: {dtype}")
94
 
95
+ try:
96
+ self.model = AutoModelForImageTextToText.from_pretrained(
97
+ self.model_name,
98
+ dtype=dtype,
99
+ trust_remote_code=True
100
+ )
101
+ except TypeError:
102
+ # Backward compatibility for older Transformers versions.
103
+ self.model = AutoModelForImageTextToText.from_pretrained(
104
+ self.model_name,
105
+ torch_dtype=dtype,
106
+ trust_remote_code=True
107
+ )
108
  self.model = self.model.to(self.device)
109
 
110
  logger.info("✅ Model loaded successfully!")