Spaces:

vidhi0405
/

VideoToText

Sleeping

App Files Community

vidhi0405 committed 8 days ago

Commit

c3d87b5

1 Parent(s): b98c114

commit 3

Browse files

Files changed (3) hide show

app.py +11 -1
huggingface_exact_approach.py +110 -56
src/smolvlm2_handler.py +13 -6

app.py CHANGED Viewed

@@ -25,7 +25,6 @@ CACHE_DIR = os.path.join("/tmp", ".cache", "huggingface")
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
 os.environ["HF_HOME"] = CACHE_DIR
-os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
 os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
 os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
 os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
@@ -178,6 +177,17 @@ async def health_check():
     }
 @app.get("/ready")
 async def readiness_check():
     loaded = detector_registry.loaded_models()

 os.makedirs(CACHE_DIR, exist_ok=True)
 os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
 os.environ["HF_HOME"] = CACHE_DIR
 os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
 os.environ["TORCH_HOME"] = os.path.join("/tmp", ".cache", "torch")
 os.environ["XDG_CACHE_HOME"] = os.path.join("/tmp", ".cache")
     }
+@app.get("/")
+async def root():
+    return {
+        "service": "SmolVLM2 Video Highlights API",
+        "status": "ok",
+        "health": "/health",
+        "ready": "/ready",
+        "upload": "/upload-video",
+    }
 @app.get("/ready")
 async def readiness_check():
     loaded = detector_registry.loaded_models()

huggingface_exact_approach.py CHANGED Viewed

@@ -67,11 +67,19 @@ class VideoHighlightDetector:
         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
-        self.model = AutoModelForImageTextToText.from_pretrained(
-            model_path,
-            torch_dtype=self.dtype,
-            # _attn_implementation="flash_attention_2"
-        ).to(device)
         # Store model path for reference
         self.model_path = model_path
@@ -424,9 +432,26 @@ class VideoHighlightDetector:
             self._concatenate_with_effects(video_path, scene_times, output_path)
         else:
             self._concatenate_basic(video_path, scene_times, output_path)
     def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
         """Basic concatenation without effects."""
         filter_complex_parts = []
         concat_inputs = []
         for i, (start_sec, end_sec) in enumerate(scene_times):
@@ -434,48 +459,62 @@ class VideoHighlightDetector:
                 f"[0:v]trim=start={start_sec}:end={end_sec},"
                 f"setpts=PTS-STARTPTS[v{i}];"
             )
-            filter_complex_parts.append(
-                f"[0:a]atrim=start={start_sec}:end={end_sec},"
-                f"asetpts=PTS-STARTPTS[a{i}];"
-            )
-            concat_inputs.append(f"[v{i}][a{i}]")
-        concat_filter = f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=1[outv][outa]"
         filter_complex = "".join(filter_complex_parts) + concat_filter
-        cmd = [
-            "ffmpeg",
-            "-y",
-            "-i", video_path,
-            "-filter_complex", filter_complex,
-            "-map", "[outv]",
-            "-map", "[outa]",
-            "-c:v", "libx264",
-            "-c:a", "aac",
-            output_path
-        ]
         logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
         subprocess.run(cmd, check=True, capture_output=True, text=True)
     def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
         """Concatenate with fade effects between segments."""
         if len(scene_times) == 1:
             # Single segment - just extract with fade in/out
             start_sec, end_sec = scene_times[0]
             duration = end_sec - start_sec
             fade_duration = min(0.5, duration / 4)  # 0.5s or 25% of duration, whichever is shorter
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-ss", str(start_sec),
-                "-t", str(duration),
-                "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
-                "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
-                "-c:v", "libx264", "-c:a", "aac",
-                output_path
-            ]
         else:
             # Multiple segments - create with crossfade transitions
             filter_parts = []
@@ -491,31 +530,46 @@ class VideoHighlightDetector:
                     f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
                 )
-                # Audio with fade
-                audio_parts.append(
-                    f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
-                    f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
-                )
             # Concatenate all segments
             video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
-            audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
-            filter_complex = (
-                ";".join(filter_parts) + ";" +
-                ";".join(audio_parts) + ";" +
-                f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
-                f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
-            )
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-filter_complex", filter_complex,
-                "-map", "[outv]", "-map", "[outa]",
-                "-c:v", "libx264", "-c:a", "aac",
-                output_path
-            ]
         logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
         result = subprocess.run(cmd, capture_output=True, text=True)

         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
+        try:
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_path,
+                dtype=self.dtype,
+                # _attn_implementation="flash_attention_2"
+            ).to(device)
+        except TypeError:
+            # Backward compatibility for older Transformers versions.
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_path,
+                torch_dtype=self.dtype,
+                # _attn_implementation="flash_attention_2"
+            ).to(device)
         # Store model path for reference
         self.model_path = model_path
             self._concatenate_with_effects(video_path, scene_times, output_path)
         else:
             self._concatenate_basic(video_path, scene_times, output_path)
+    def _video_has_audio(self, video_path: str) -> bool:
+        """Return True when the input contains at least one audio stream."""
+        cmd = [
+            "ffprobe",
+            "-v", "error",
+            "-select_streams", "a",
+            "-show_entries", "stream=index",
+            "-of", "csv=p=0",
+            video_path,
+        ]
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            return bool(result.stdout.strip())
+        except Exception:
+            return False
     def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
         """Basic concatenation without effects."""
+        has_audio = self._video_has_audio(video_path)
         filter_complex_parts = []
         concat_inputs = []
         for i, (start_sec, end_sec) in enumerate(scene_times):
                 f"[0:v]trim=start={start_sec}:end={end_sec},"
                 f"setpts=PTS-STARTPTS[v{i}];"
             )
+            if has_audio:
+                filter_complex_parts.append(
+                    f"[0:a]atrim=start={start_sec}:end={end_sec},"
+                    f"asetpts=PTS-STARTPTS[a{i}];"
+                )
+                concat_inputs.append(f"[v{i}][a{i}]")
+            else:
+                concat_inputs.append(f"[v{i}]")
+        concat_filter = (
+            f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=1[outv][outa]"
+            if has_audio
+            else f"{''.join(concat_inputs)}concat=n={len(scene_times)}:v=1:a=0[outv]"
+        )
         filter_complex = "".join(filter_complex_parts) + concat_filter
+        cmd = ["ffmpeg", "-y", "-i", video_path, "-filter_complex", filter_complex, "-map", "[outv]"]
+        if has_audio:
+            cmd += ["-map", "[outa]", "-c:v", "libx264", "-c:a", "aac", output_path]
+        else:
+            cmd += ["-an", "-c:v", "libx264", output_path]
         logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
         subprocess.run(cmd, check=True, capture_output=True, text=True)
     def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
         """Concatenate with fade effects between segments."""
+        has_audio = self._video_has_audio(video_path)
         if len(scene_times) == 1:
             # Single segment - just extract with fade in/out
             start_sec, end_sec = scene_times[0]
             duration = end_sec - start_sec
             fade_duration = min(0.5, duration / 4)  # 0.5s or 25% of duration, whichever is shorter
+            if has_audio:
+                cmd = [
+                    "ffmpeg", "-y",
+                    "-i", video_path,
+                    "-ss", str(start_sec),
+                    "-t", str(duration),
+                    "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
+                    "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
+                    "-c:v", "libx264", "-c:a", "aac",
+                    output_path
+                ]
+            else:
+                cmd = [
+                    "ffmpeg", "-y",
+                    "-i", video_path,
+                    "-ss", str(start_sec),
+                    "-t", str(duration),
+                    "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
+                    "-an",
+                    "-c:v", "libx264",
+                    output_path
+                ]
         else:
             # Multiple segments - create with crossfade transitions
             filter_parts = []
                     f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
                 )
+                if has_audio:
+                    # Audio with fade
+                    audio_parts.append(
+                        f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
+                        f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
+                    )
             # Concatenate all segments
             video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
+            if has_audio:
+                audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
+                filter_complex = (
+                    ";".join(filter_parts) + ";" +
+                    ";".join(audio_parts) + ";" +
+                    f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
+                    f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
+                )
+                cmd = [
+                    "ffmpeg", "-y",
+                    "-i", video_path,
+                    "-filter_complex", filter_complex,
+                    "-map", "[outv]", "-map", "[outa]",
+                    "-c:v", "libx264", "-c:a", "aac",
+                    output_path
+                ]
+            else:
+                filter_complex = (
+                    ";".join(filter_parts) + ";" +
+                    f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv]"
+                )
+                cmd = [
+                    "ffmpeg", "-y",
+                    "-i", video_path,
+                    "-filter_complex", filter_complex,
+                    "-map", "[outv]",
+                    "-an",
+                    "-c:v", "libx264",
+                    output_path
+                ]
         logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
         result = subprocess.run(cmd, capture_output=True, text=True)

src/smolvlm2_handler.py CHANGED Viewed

@@ -13,7 +13,6 @@ if 'HF_HOME' not in os.environ:
     os.makedirs(CACHE_DIR, exist_ok=True)
     os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
     os.environ['HF_HOME'] = CACHE_DIR
-    os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
     os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
     os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
     os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
@@ -93,11 +92,19 @@ class SmolVLM2Handler:
             dtype = self._get_torch_dtype()
             logger.info(f"Using torch dtype: {dtype}")
-            self.model = AutoModelForImageTextToText.from_pretrained(
-                self.model_name,
-                torch_dtype=dtype,
-                trust_remote_code=True
-            )
             self.model = self.model.to(self.device)
             logger.info("✅ Model loaded successfully!")

     os.makedirs(CACHE_DIR, exist_ok=True)
     os.makedirs(os.path.join("/tmp", ".cache", "torch"), exist_ok=True)
     os.environ['HF_HOME'] = CACHE_DIR
     os.environ['HF_DATASETS_CACHE'] = CACHE_DIR
     os.environ['TORCH_HOME'] = os.path.join("/tmp", ".cache", "torch")
     os.environ['XDG_CACHE_HOME'] = os.path.join("/tmp", ".cache")
             dtype = self._get_torch_dtype()
             logger.info(f"Using torch dtype: {dtype}")
+            try:
+                self.model = AutoModelForImageTextToText.from_pretrained(
+                    self.model_name,
+                    dtype=dtype,
+                    trust_remote_code=True
+                )
+            except TypeError:
+                # Backward compatibility for older Transformers versions.
+                self.model = AutoModelForImageTextToText.from_pretrained(
+                    self.model_name,
+                    torch_dtype=dtype,
+                    trust_remote_code=True
+                )
             self.model = self.model.to(self.device)
             logger.info("✅ Model loaded successfully!")