Spaces:

factorstudios
/

segment

Running

App Files Community

factorstudios committed on 2 days ago

Commit

ddb7115

verified ·

1 Parent(s): abb20ff

Update server.py

Browse files

Files changed (1) hide show

server.py +54 -51

server.py CHANGED Viewed

@@ -1,3 +1,4 @@
 #!/usr/bin/env python3
 import os
 import json
@@ -41,13 +42,12 @@ processing_state = {
     "current_file": None,
     "error_count": 0,
     "last_error": None,
-    "processed_files": []
 }
-# Load Whisper model once at startup (small = good balance of speed/accuracy)
-print("Loading Whisper small model...")
-whisper_model = WhisperModel("small", device="auto", compute_type="int8")
-print("✓ Whisper model loaded")
 HF_DATASET_REPO = "factorstudios/movs"
 HOOKS_FOLDER = "hooks"
@@ -55,6 +55,15 @@ READY_VIDEOS_FOLDER = "ready_videos"
 TRANSCRIPTION_FOLDER = "transcriptions"
 def timestamp_to_seconds(timestamp: str) -> float:
     """Convert HH:MM:SS to seconds."""
     try:
@@ -72,10 +81,10 @@ def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: fl
         "-ss", str(start_seconds),
         "-to", str(end_seconds),
         "-i", video_path,
-        "-vn",                    # no video
-        "-acodec", "pcm_s16le",   # WAV format Whisper expects
-        "-ar", "16000",           # 16kHz sample rate (Whisper requirement)
-        "-ac", "1",               # mono
         output_wav
     ]
     result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
@@ -85,14 +94,14 @@ def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: fl
 def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
     """
     Transcribe audio with Whisper small.
-    Returns list of (start_sec, end_sec, text) — all relative to segment start.
     """
     print("  Transcribing audio with Whisper small...")
     segments, info = whisper_model.transcribe(
         audio_path,
         beam_size=5,
-        language=None,        # auto-detect language
-        vad_filter=True,      # skip silence
         vad_parameters=dict(min_silence_duration_ms=500)
     )
@@ -112,33 +121,27 @@ def apply_color_grading_wedding_retro(frame: np.ndarray) -> np.ndarray:
     lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
     l_channel, a_channel, b_channel = cv2.split(lab)
-    # 1. VINTAGE/RETRO: warm tones
     a_channel = cv2.add(a_channel, 5)
     b_channel = cv2.add(b_channel, 8)
-    # 2. WEDDING LOOK: soft highlights via CLAHE
     clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     l_channel = clahe.apply(l_channel)
     lab_enhanced = cv2.merge([l_channel, a_channel, b_channel])
     frame = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)
-    # 3. SATURATION BOOST
     hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV).astype(np.float32)
     hsv[:, :, 1] = np.clip(hsv[:, :, 1] * 1.3, 0, 255)
     frame = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
-    # 4. CONTRAST ENHANCEMENT
     frame = cv2.convertScaleAbs(frame, alpha=1.15, beta=10)
-    # 5. HIGH SHARPENING
     kernel = np.array([[-1, -1, -1],
                        [-1,  9, -1],
                        [-1, -1, -1]]) / 1.2
     sharpened = cv2.filter2D(frame, -1, kernel)
     frame = cv2.addWeighted(frame, 0.4, sharpened, 0.6, 0)
-    # 6. SLIGHT VIGNETTE
     rows, cols = frame.shape[:2]
     X_kernel = cv2.getGaussianKernel(cols, cols / 2)
     Y_kernel = cv2.getGaussianKernel(rows, rows / 2)
@@ -164,7 +167,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
     except Exception:
         font = ImageFont.load_default()
-    # Word-wrap text
     max_width = width - 80
     wrapped_lines = []
     words = text.split()
@@ -184,8 +186,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
     line_height = font_size + 12
     total_text_height = len(wrapped_lines) * line_height
-    # Position: 80% down the frame (near bottom, not center)
     y_start = int(height * 0.80) - total_text_height // 2
     shadow_offset = 3
@@ -195,9 +195,7 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
         x = (width - line_width) // 2
         y = y_start + i * line_height
-        # Shadow layer
         draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
-        # Main white text
         draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
     frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
@@ -205,10 +203,7 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
 def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
-    """
-    Convert Whisper (start, end, text) segments into a per-frame caption map.
-    Each frame number maps to the caption active at that time.
-    """
     frame_map = {}
     for start_sec, end_sec, text in captions:
         start_frame = int(start_sec * fps)
@@ -234,7 +229,7 @@ def process_video_segment(
     4. Mux processed video with original audio
     """
     ffmpeg_video_proc = None
-    temp_wav = None
     temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
     try:
@@ -256,18 +251,17 @@ def process_video_segment(
         print(f"Video info: {fps} fps, {original_width}x{original_height}")
         print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
-        # ── Step 1: Extract audio segment as WAV ──────────────────────────────
-        temp_wav = output_path.replace(".mp4", "_audio.wav")
         print("  Extracting audio segment...")
         audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
-        if not audio_ok:
-            print("  Warning: Audio extraction failed, captions will be skipped")
-            captions = []
-        else:
-            # ── Step 2: Transcribe with Whisper ───────────────────────────────
             captions = transcribe_segment(temp_wav)
-        # Build per-frame caption lookup from Whisper timestamps
         frame_caption_map = build_frame_caption_map(captions, fps)
         # ── Step 3: Process frames → pipe to FFmpeg ───────────────────────────
@@ -308,7 +302,6 @@ def process_video_segment(
                 print(f"Warning: Could not read frame at position {processed_frames}")
                 break
-            # Crop to target aspect ratio
             aspect_ratio = target_width / target_height
             if original_width / original_height > aspect_ratio:
                 new_width = int(original_height * aspect_ratio)
@@ -322,12 +315,10 @@ def process_video_segment(
             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
             frame = apply_color_grading_wedding_retro(frame)
-            # Update caption from Whisper frame map
-            if processed_frames in frame_caption_map:
-                current_caption = frame_caption_map[processed_frames]
-            elif processed_frames not in frame_caption_map and current_caption:
-                # Clear caption when we're past its end frame
-                current_caption = frame_caption_map.get(processed_frames, "")
             if current_caption:
                 frame = burn_captions_to_frame(frame, current_caption)
@@ -376,7 +367,7 @@ def process_video_segment(
             print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
             return False
-        print(f"✓ Video segment with audio saved: {output_path}")
         return True
     except Exception as e:
@@ -390,10 +381,12 @@ def process_video_segment(
         return False
     finally:
-        # Clean up temp files
         for tmp in [temp_video_path, temp_wav]:
             if tmp and os.path.exists(tmp):
-                os.remove(tmp)
 async def process_movie_segments(movie_name: str) -> bool:
@@ -404,7 +397,6 @@ async def process_movie_segments(movie_name: str) -> bool:
         print(f"Processing movie: {movie_name}")
         print(f"{'='*80}")
-        # Download original video
         video_file = f"{movie_name}.mkv"
         print(f"Downloading video: {video_file}")
@@ -422,7 +414,6 @@ async def process_movie_segments(movie_name: str) -> bool:
             print(f"Error: Could not download video: {e}")
             return False
-        # List segment JSON files
         hooks_folder = f"{HOOKS_FOLDER}/{movie_name}"
         print(f"Listing segments from: {hooks_folder}")
@@ -442,7 +433,6 @@ async def process_movie_segments(movie_name: str) -> bool:
             return False
         print(f"Found {len(segment_files)} segments")
         temp_dir = tempfile.mkdtemp()
         try:
@@ -519,6 +509,7 @@ async def scan_and_process_videos():
         print("Video processing already running, skipping...")
         return
     print("Waiting 3 minutes before starting video processing...")
     await asyncio.sleep(180)
@@ -562,6 +553,11 @@ async def scan_and_process_videos():
 @app.on_event("startup")
 async def startup_event():
     asyncio.create_task(scan_and_process_videos())
@@ -570,6 +566,7 @@ async def health():
     return JSONResponse({
         "status": "running",
         "service": "Video Processing Service",
         "is_processing": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
         "error_count": processing_state["error_count"],
@@ -582,6 +579,7 @@ async def health():
 @app.get("/status")
 async def get_status():
     return JSONResponse({
         "is_running": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
         "error_count": processing_state["error_count"],
@@ -598,6 +596,11 @@ async def trigger_processing():
             "status": "already_running",
             "message": "Video processing is already in progress"
         })
     asyncio.create_task(scan_and_process_videos())
     return JSONResponse({
         "status": "started",
@@ -607,5 +610,5 @@ async def trigger_processing():
 if __name__ == "__main__":
     print("Starting Video Processing Service on port 7860...")
-    print("Processing will begin 3 minutes after startup")
-    uvicorn.run(app, host="0.0.0.0", port=7860)

+ENDOFFILE'
 #!/usr/bin/env python3
 import os
 import json
     "current_file": None,
     "error_count": 0,
     "last_error": None,
+    "processed_files": [],
+    "whisper_ready": False
 }
+# Whisper model — loaded async at startup, not at import time
+whisper_model = None
 HF_DATASET_REPO = "factorstudios/movs"
 HOOKS_FOLDER = "hooks"
 TRANSCRIPTION_FOLDER = "transcriptions"
+def _load_whisper_model():
+    """Blocking model load — runs in thread executor."""
+    global whisper_model
+    print("Loading Whisper small model...")
+    whisper_model = WhisperModel("small", device="auto", compute_type="int8")
+    processing_state["whisper_ready"] = True
+    print("✓ Whisper model loaded")
 def timestamp_to_seconds(timestamp: str) -> float:
     """Convert HH:MM:SS to seconds."""
     try:
         "-ss", str(start_seconds),
         "-to", str(end_seconds),
         "-i", video_path,
+        "-vn",
+        "-acodec", "pcm_s16le",
+        "-ar", "16000",
+        "-ac", "1",
         output_wav
     ]
     result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
     """
     Transcribe audio with Whisper small.
+    Returns list of (start_sec, end_sec, text) relative to segment start.
     """
     print("  Transcribing audio with Whisper small...")
     segments, info = whisper_model.transcribe(
         audio_path,
         beam_size=5,
+        language=None,
+        vad_filter=True,
         vad_parameters=dict(min_silence_duration_ms=500)
     )
     lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
     l_channel, a_channel, b_channel = cv2.split(lab)
     a_channel = cv2.add(a_channel, 5)
     b_channel = cv2.add(b_channel, 8)
     clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     l_channel = clahe.apply(l_channel)
     lab_enhanced = cv2.merge([l_channel, a_channel, b_channel])
     frame = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)
     hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV).astype(np.float32)
     hsv[:, :, 1] = np.clip(hsv[:, :, 1] * 1.3, 0, 255)
     frame = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
     frame = cv2.convertScaleAbs(frame, alpha=1.15, beta=10)
     kernel = np.array([[-1, -1, -1],
                        [-1,  9, -1],
                        [-1, -1, -1]]) / 1.2
     sharpened = cv2.filter2D(frame, -1, kernel)
     frame = cv2.addWeighted(frame, 0.4, sharpened, 0.6, 0)
     rows, cols = frame.shape[:2]
     X_kernel = cv2.getGaussianKernel(cols, cols / 2)
     Y_kernel = cv2.getGaussianKernel(rows, rows / 2)
     except Exception:
         font = ImageFont.load_default()
     max_width = width - 80
     wrapped_lines = []
     words = text.split()
     line_height = font_size + 12
     total_text_height = len(wrapped_lines) * line_height
     y_start = int(height * 0.80) - total_text_height // 2
     shadow_offset = 3
         x = (width - line_width) // 2
         y = y_start + i * line_height
         draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
         draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
     frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
 def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
+    """Convert Whisper segments into a per-frame caption lookup."""
     frame_map = {}
     for start_sec, end_sec, text in captions:
         start_frame = int(start_sec * fps)
     4. Mux processed video with original audio
     """
     ffmpeg_video_proc = None
+    temp_wav = output_path.replace(".mp4", "_audio.wav")
     temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
     try:
         print(f"Video info: {fps} fps, {original_width}x{original_height}")
         print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
+        # ── Step 1: Extract audio → WAV ───────────────────────────────────────
         print("  Extracting audio segment...")
         audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
+        # ── Step 2: Transcribe with Whisper ───────────────────────────────────
+        if audio_ok and whisper_model is not None:
             captions = transcribe_segment(temp_wav)
+        else:
+            print("  Warning: Skipping transcription (audio failed or model not ready)")
+            captions = []
         frame_caption_map = build_frame_caption_map(captions, fps)
         # ── Step 3: Process frames → pipe to FFmpeg ───────────────────────────
                 print(f"Warning: Could not read frame at position {processed_frames}")
                 break
             aspect_ratio = target_width / target_height
             if original_width / original_height > aspect_ratio:
                 new_width = int(original_height * aspect_ratio)
             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
             frame = apply_color_grading_wedding_retro(frame)
+            current_caption = frame_caption_map.get(processed_frames, current_caption)
+            # Clear caption if this frame isn't in the map and the previous caption has ended
+            if processed_frames not in frame_caption_map:
+                current_caption = ""
             if current_caption:
                 frame = burn_captions_to_frame(frame, current_caption)
             print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
             return False
+        print(f"✓ Segment complete: {output_path}")
         return True
     except Exception as e:
         return False
     finally:
         for tmp in [temp_video_path, temp_wav]:
             if tmp and os.path.exists(tmp):
+                try:
+                    os.remove(tmp)
+                except Exception:
+                    pass
 async def process_movie_segments(movie_name: str) -> bool:
         print(f"Processing movie: {movie_name}")
         print(f"{'='*80}")
         video_file = f"{movie_name}.mkv"
         print(f"Downloading video: {video_file}")
             print(f"Error: Could not download video: {e}")
             return False
         hooks_folder = f"{HOOKS_FOLDER}/{movie_name}"
         print(f"Listing segments from: {hooks_folder}")
             return False
         print(f"Found {len(segment_files)} segments")
         temp_dir = tempfile.mkdtemp()
         try:
         print("Video processing already running, skipping...")
         return
+    # Wait 3 minutes for Space to fully initialize
     print("Waiting 3 minutes before starting video processing...")
     await asyncio.sleep(180)
 @app.on_event("startup")
 async def startup_event():
+    """Load Whisper in background, then kick off video processing after 3 min."""
+    loop = asyncio.get_event_loop()
+    # Load Whisper model in thread so it doesn't block the event loop / health check
+    await loop.run_in_executor(None, _load_whisper_model)
+    # Kick off processing task (has its own 3-min delay inside)
     asyncio.create_task(scan_and_process_videos())
     return JSONResponse({
         "status": "running",
         "service": "Video Processing Service",
+        "whisper_ready": processing_state["whisper_ready"],
         "is_processing": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
         "error_count": processing_state["error_count"],
 @app.get("/status")
 async def get_status():
     return JSONResponse({
+        "whisper_ready": processing_state["whisper_ready"],
         "is_running": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
         "error_count": processing_state["error_count"],
             "status": "already_running",
             "message": "Video processing is already in progress"
         })
+    if not processing_state["whisper_ready"]:
+        return JSONResponse({
+            "status": "not_ready",
+            "message": "Whisper model is still loading, try again shortly"
+        })
     asyncio.create_task(scan_and_process_videos())
     return JSONResponse({
         "status": "started",
 if __name__ == "__main__":
     print("Starting Video Processing Service on port 7860...")
+    print("Whisper will load at startup, processing begins 3 minutes after")
+    uvicorn.run(app, host="0.0.0.0", port=7860)