Spaces:
Sleeping
Sleeping
commit 3
Browse files
- app.py +11 -1
- huggingface_exact_approach.py +110 -56
- src/smolvlm2_handler.py +13 -6
app.py
CHANGED
|
@@ -25,7 +25,6 @@ CACHE_DIR = os.path.join("/tmp", ".cache", "huggingface")
|
|
| 25 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 26 |
os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
|
| 27 |
os.environ["HF_HOME"] = CACHE_DIR
|
| 28 |
-
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
|
| 29 |
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
|
| 30 |
os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
|
| 31 |
os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
|
|
@@ -178,6 +177,17 @@ async def health_check():
|
|
| 178 |
}
|
| 179 |
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
@app.get("/ready")
|
| 182 |
async def readiness_check():
|
| 183 |
loaded = detector_registry.loaded_models()
|
|
|
|
| 25 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 26 |
os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
|
| 27 |
os.environ["HF_HOME"] = CACHE_DIR
|
|
|
|
| 28 |
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
|
| 29 |
os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
|
| 30 |
os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
|
|
|
|
| 177 |
}
|
| 178 |
|
| 179 |
|
| 180 |
+
@app.get("/")
|
| 181 |
+
async def root():
|
| 182 |
+
return {
|
| 183 |
+
"service": "SmolVLM2 Video Highlights API",
|
| 184 |
+
"status": "ok",
|
| 185 |
+
"health": "/health",
|
| 186 |
+
"ready": "/ready",
|
| 187 |
+
"upload": "/upload-video",
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
|
| 191 |
@app.get("/ready")
|
| 192 |
async def readiness_check():
|
| 193 |
loaded = detector_registry.loaded_models()
|
huggingface_exact_approach.py
CHANGED
|
@@ -67,11 +67,19 @@ class VideoHighlightDetector:
|
|
| 67 |
|
| 68 |
# Initialize model and processor
|
| 69 |
self.processor = AutoProcessor.from_pretrained(model_path)
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# Store model path for reference
|
| 77 |
self.model_path = model_path
|
|
@@ -424,9 +432,26 @@ class VideoHighlightDetector:
|
|
| 424 |
self._concatenate_with_effects(video_path, scene_times, output_path)
|
| 425 |
else:
|
| 426 |
self._concatenate_basic(video_path, scene_times, output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
|
| 428 |
def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
|
| 429 |
"""Basic concatenation without effects."""
|
|
|
|
| 430 |
filter_complex_parts = []
|
| 431 |
concat_inputs = []
|
| 432 |
for i, (start_sec, end_sec) in enumerate(scene_times):
|
|
@@ -434,48 +459,62 @@ class VideoHighlightDetector:
|
|
| 434 |
f"[0:v]trim=start={start_sec}:end={end_sec},"
|
| 435 |
f"setpts=PTS-STARTPTS[v{i}];"
|
| 436 |
)
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
filter_complex = "".join(filter_complex_parts) + concat_filter
|
| 445 |
|
| 446 |
-
cmd = [
|
| 447 |
-
|
| 448 |
-
"-
|
| 449 |
-
|
| 450 |
-
"-
|
| 451 |
-
"-map", "[outv]",
|
| 452 |
-
"-map", "[outa]",
|
| 453 |
-
"-c:v", "libx264",
|
| 454 |
-
"-c:a", "aac",
|
| 455 |
-
output_path
|
| 456 |
-
]
|
| 457 |
|
| 458 |
logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
|
| 459 |
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
| 460 |
|
| 461 |
def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
|
| 462 |
"""Concatenate with fade effects between segments."""
|
|
|
|
| 463 |
if len(scene_times) == 1:
|
| 464 |
# Single segment - just extract with fade in/out
|
| 465 |
start_sec, end_sec = scene_times[0]
|
| 466 |
duration = end_sec - start_sec
|
| 467 |
fade_duration = min(0.5, duration / 4) # 0.5s or 25% of duration, whichever is shorter
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
else:
|
| 480 |
# Multiple segments - create with crossfade transitions
|
| 481 |
filter_parts = []
|
|
@@ -491,31 +530,46 @@ class VideoHighlightDetector:
|
|
| 491 |
f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
|
| 492 |
)
|
| 493 |
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
|
|
|
| 499 |
|
| 500 |
# Concatenate all segments
|
| 501 |
video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
|
| 520 |
logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
|
| 521 |
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
|
|
| 67 |
|
| 68 |
# Initialize model and processor
|
| 69 |
self.processor = AutoProcessor.from_pretrained(model_path)
|
| 70 |
+
try:
|
| 71 |
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 72 |
+
model_path,
|
| 73 |
+
dtype=self.dtype,
|
| 74 |
+
# _attn_implementation="flash_attention_2"
|
| 75 |
+
).to(device)
|
| 76 |
+
except TypeError:
|
| 77 |
+
# Backward compatibility for older Transformers versions.
|
| 78 |
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 79 |
+
model_path,
|
| 80 |
+
torch_dtype=self.dtype,
|
| 81 |
+
# _attn_implementation="flash_attention_2"
|
| 82 |
+
).to(device)
|
| 83 |
|
| 84 |
# Store model path for reference
|
| 85 |
self.model_path = model_path
|
|
|
|
| 432 |
self._concatenate_with_effects(video_path, scene_times, output_path)
|
| 433 |
else:
|
| 434 |
self._concatenate_basic(video_path, scene_times, output_path)
|
| 435 |
+
|
| 436 |
+
def _video_has_audio(self, video_path: str) -> bool:
|
| 437 |
+
"""Return True when the input contains at least one audio stream."""
|
| 438 |
+
cmd = [
|
| 439 |
+
"ffprobe",
|
| 440 |
+
"-v", "error",
|
| 441 |
+
"-select_streams", "a",
|
| 442 |
+
"-show_entries", "stream=index",
|
| 443 |
+
"-of", "csv=p=0",
|
| 444 |
+
video_path,
|
| 445 |
+
]
|
| 446 |
+
try:
|
| 447 |
+
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
| 448 |
+
return bool(result.stdout.strip())
|
| 449 |
+
except Exception:
|
| 450 |
+
return False
|
| 451 |
|
| 452 |
def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
|
| 453 |
"""Basic concatenation without effects."""
|
| 454 |
+
has_audio = self._video_has_audio(video_path)
|
| 455 |
filter_complex_parts = []
|
| 456 |
concat_inputs = []
|
| 457 |
for i, (start_sec, end_sec) in enumerate(scene_times):
|
|
|
|
| 459 |
f"[0:v]trim=start={start_sec}:end={end_sec},"
|
| 460 |
f"setpts=PTS-STARTPTS[v{i}];"
|
| 461 |
)
|
| 462 |
+
if has_audio:
|
| 463 |
+
filter_complex_parts.append(
|
| 464 |
+
f"[0:a]atrim=start={start_sec}:end={end_sec},"
|
| 465 |
+
f"asetpts=PTS-STARTPTS[a{i}];"
|
| 466 |
+
)
|
| 467 |
+
concat_inputs.append(f"[v{i}][a{i}]")
|
| 468 |
+
else:
|
| 469 |
+
concat_inputs.append(f"[v{i}]")
|
| 470 |
+
|
| 471 |
+
concat_filter = (
|
| 472 |
+
f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=1[outv][outa]"
|
| 473 |
+
if has_audio
|
| 474 |
+
else f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=0[outv]"
|
| 475 |
+
)
|
| 476 |
filter_complex = "".join(filter_complex_parts) + concat_filter
|
| 477 |
|
| 478 |
+
cmd = ["ffmpeg", "-y", "-i", video_path, "-filter_complex", filter_complex, "-map", "[outv]"]
|
| 479 |
+
if has_audio:
|
| 480 |
+
cmd += ["-map", "[outa]", "-c:v", "libx264", "-c:a", "aac", output_path]
|
| 481 |
+
else:
|
| 482 |
+
cmd += ["-an", "-c:v", "libx264", output_path]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
|
| 485 |
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
| 486 |
|
| 487 |
def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
|
| 488 |
"""Concatenate with fade effects between segments."""
|
| 489 |
+
has_audio = self._video_has_audio(video_path)
|
| 490 |
if len(scene_times) == 1:
|
| 491 |
# Single segment - just extract with fade in/out
|
| 492 |
start_sec, end_sec = scene_times[0]
|
| 493 |
duration = end_sec - start_sec
|
| 494 |
fade_duration = min(0.5, duration / 4) # 0.5s or 25% of duration, whichever is shorter
|
| 495 |
+
|
| 496 |
+
if has_audio:
|
| 497 |
+
cmd = [
|
| 498 |
+
"ffmpeg", "-y",
|
| 499 |
+
"-i", video_path,
|
| 500 |
+
"-ss", str(start_sec),
|
| 501 |
+
"-t", str(duration),
|
| 502 |
+
"-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
|
| 503 |
+
"-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
|
| 504 |
+
"-c:v", "libx264", "-c:a", "aac",
|
| 505 |
+
output_path
|
| 506 |
+
]
|
| 507 |
+
else:
|
| 508 |
+
cmd = [
|
| 509 |
+
"ffmpeg", "-y",
|
| 510 |
+
"-i", video_path,
|
| 511 |
+
"-ss", str(start_sec),
|
| 512 |
+
"-t", str(duration),
|
| 513 |
+
"-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
|
| 514 |
+
"-an",
|
| 515 |
+
"-c:v", "libx264",
|
| 516 |
+
output_path
|
| 517 |
+
]
|
| 518 |
else:
|
| 519 |
# Multiple segments - create with crossfade transitions
|
| 520 |
filter_parts = []
|
|
|
|
| 530 |
f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
|
| 531 |
)
|
| 532 |
|
| 533 |
+
if has_audio:
|
| 534 |
+
# Audio with fade
|
| 535 |
+
audio_parts.append(
|
| 536 |
+
f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
|
| 537 |
+
f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
|
| 538 |
+
)
|
| 539 |
|
| 540 |
# Concatenate all segments
|
| 541 |
video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
|
| 542 |
+
|
| 543 |
+
if has_audio:
|
| 544 |
+
audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
|
| 545 |
+
filter_complex = (
|
| 546 |
+
";".join(filter_parts) + ";" +
|
| 547 |
+
";".join(audio_parts) + ";" +
|
| 548 |
+
f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
|
| 549 |
+
f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
|
| 550 |
+
)
|
| 551 |
+
cmd = [
|
| 552 |
+
"ffmpeg", "-y",
|
| 553 |
+
"-i", video_path,
|
| 554 |
+
"-filter_complex", filter_complex,
|
| 555 |
+
"-map", "[outv]", "-map", "[outa]",
|
| 556 |
+
"-c:v", "libx264", "-c:a", "aac",
|
| 557 |
+
output_path
|
| 558 |
+
]
|
| 559 |
+
else:
|
| 560 |
+
filter_complex = (
|
| 561 |
+
";".join(filter_parts) + ";" +
|
| 562 |
+
f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv]"
|
| 563 |
+
)
|
| 564 |
+
cmd = [
|
| 565 |
+
"ffmpeg", "-y",
|
| 566 |
+
"-i", video_path,
|
| 567 |
+
"-filter_complex", filter_complex,
|
| 568 |
+
"-map", "[outv]",
|
| 569 |
+
"-an",
|
| 570 |
+
"-c:v", "libx264",
|
| 571 |
+
output_path
|
| 572 |
+
]
|
| 573 |
|
| 574 |
logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
|
| 575 |
result = subprocess.run(cmd, capture_output=True, text=True)
|
src/smolvlm2_handler.py
CHANGED
|
@@ -13,7 +13,6 @@ if 'HF_HOME' not in os.environ:
|
|
| 13 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 14 |
os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
|
| 15 |
os.environ['HF_HOME'] = CACHE_DIR
|
| 16 |
-
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
|
| 17 |
os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
|
| 18 |
os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
|
| 19 |
os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
|
|
@@ -93,11 +92,19 @@ class SmolVLM2Handler:
|
|
| 93 |
dtype = self._get_torch_dtype()
|
| 94 |
logger.info(f"Using torch dtype: {dtype}")
|
| 95 |
|
| 96 |
-
|
| 97 |
-
self.
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
self.model = self.model.to(self.device)
|
| 102 |
|
| 103 |
logger.info("✅ Model loaded successfully!")
|
|
|
|
| 13 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 14 |
os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
|
| 15 |
os.environ['HF_HOME'] = CACHE_DIR
|
|
|
|
| 16 |
os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
|
| 17 |
os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
|
| 18 |
os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
|
|
|
|
| 92 |
dtype = self._get_torch_dtype()
|
| 93 |
logger.info(f"Using torch dtype: {dtype}")
|
| 94 |
|
| 95 |
+
try:
|
| 96 |
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 97 |
+
self.model_name,
|
| 98 |
+
dtype=dtype,
|
| 99 |
+
trust_remote_code=True
|
| 100 |
+
)
|
| 101 |
+
except TypeError:
|
| 102 |
+
# Backward compatibility for older Transformers versions.
|
| 103 |
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 104 |
+
self.model_name,
|
| 105 |
+
torch_dtype=dtype,
|
| 106 |
+
trust_remote_code=True
|
| 107 |
+
)
|
| 108 |
self.model = self.model.to(self.device)
|
| 109 |
|
| 110 |
logger.info("✅ Model loaded successfully!")
|