Spaces:

factorstudios
/

segment

Sleeping

App Files Files Community

factorstudios committed 3 days ago

Commit

abb20ff

verified ·

1 Parent(s): 3b93ec7

Update server.py

Browse files

Files changed (1) hide show

server.py +112 -86

server.py CHANGED Viewed

@@ -8,7 +8,7 @@ import subprocess
 from pathlib import Path
 from datetime import datetime
 from dotenv import load_dotenv
-from typing import List, Dict, Optional
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
@@ -19,8 +19,10 @@ try:
     import cv2
     import numpy as np
     from PIL import Image, ImageDraw, ImageFont
 except ImportError as e:
     print(f"Missing dependency: {e}")
     exit(1)
 # Load environment variables
@@ -42,6 +44,11 @@ processing_state = {
     "processed_files": []
 }
 HF_DATASET_REPO = "factorstudios/movs"
 HOOKS_FOLDER = "hooks"
 READY_VIDEOS_FOLDER = "ready_videos"
@@ -52,33 +59,51 @@ def timestamp_to_seconds(timestamp: str) -> float:
     """Convert HH:MM:SS to seconds."""
     try:
         parts = timestamp.split(":")
-        hours = int(parts[0])
-        minutes = int(parts[1])
-        seconds = int(parts[2])
-        return hours * 3600 + minutes * 60 + seconds
     except Exception as e:
         print(f"Error converting timestamp {timestamp}: {e}")
         return 0.0
def extract_captions_for_segment(transcript_content: str, start_time: str, end_time: str) -> List[tuple]:
    """Collect the transcript lines whose timestamp lies inside a segment.

    Transcript lines are expected to look like ``[HH:MM:SS] some text``;
    lines that do not match that shape are ignored. Both segment bounds
    are inclusive.

    Returns a list of ``(relative_seconds, text)`` tuples, where
    ``relative_seconds`` is the line's timestamp minus the segment start.
    """
    window_start = timestamp_to_seconds(start_time)
    window_end = timestamp_to_seconds(end_time)
    line_pattern = re.compile(r'\[(\d{2}):(\d{2}):(\d{2})\]\s+(.*)')

    selected = []
    for raw_line in transcript_content.strip().split('\n'):
        found = line_pattern.match(raw_line)
        if found is None:
            continue
        hh, mm, ss, body = found.groups()
        absolute = int(hh) * 3600 + int(mm) * 60 + int(ss)
        # Skip lines stamped before the segment starts or after it ends.
        if absolute < window_start or absolute > window_end:
            continue
        selected.append((absolute - window_start, body.strip()))
    return selected
@@ -162,7 +187,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
     # Position: 80% down the frame (near bottom, not center)
     y_start = int(height * 0.80) - total_text_height // 2
     shadow_offset = 3
     for i, line in enumerate(wrapped_lines):
@@ -171,26 +195,48 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
         x = (width - line_width) // 2
         y = y_start + i * line_height
-        # Draw shadow (dark, slightly offset)
         draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
-        # Draw main white text
         draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
     frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
     return cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
 def process_video_segment(
     video_path: str,
     output_path: str,
     start_time: str,
     end_time: str,
-    captions: List[tuple],
     target_width: int = 1080,
     target_height: int = 1350
 ) -> bool:
-    """Process video segment: crop, resize, color grade, burn captions, encode with audio via FFmpeg."""
     ffmpeg_video_proc = None
     try:
         print(f"Opening video: {video_path}")
         cap = cv2.VideoCapture(video_path)
@@ -210,9 +256,21 @@ def process_video_segment(
         print(f"Video info: {fps} fps, {original_width}x{original_height}")
         print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
-        # Step 1: Write processed frames to a temp video-only file
-        temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
         ffmpeg_video_cmd = [
             "ffmpeg", "-y",
             "-f", "rawvideo",
@@ -235,16 +293,9 @@ def process_video_segment(
             stderr=subprocess.DEVNULL
         )
-        # Seek to start frame
         start_frame = int(start_seconds * fps)
         cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
-        # Build caption lookup: frame_number -> text
-        caption_map = {}
-        for rel_time, caption_text in captions:
-            frame_num = int(rel_time * fps)
-            caption_map[frame_num] = caption_text
         current_caption = ""
         processed_frames = 0
         target_frames = int(duration * fps)
@@ -271,8 +322,12 @@ def process_video_segment(
             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
             frame = apply_color_grading_wedding_retro(frame)
-            if processed_frames in caption_map:
-                current_caption = caption_map[processed_frames]
             if current_caption:
                 frame = burn_captions_to_frame(frame, current_caption)
@@ -289,22 +344,22 @@ def process_video_segment(
         cap.release()
         if ffmpeg_video_proc.returncode != 0:
-            print(f"✗ FFmpeg video encoding failed with return code {ffmpeg_video_proc.returncode}")
             return False
-        print("✓ Video frames encoded, muxing audio...")
-        # Step 2: Mux processed video with audio extracted directly from source
         ffmpeg_mux_cmd = [
             "ffmpeg", "-y",
-            "-i", temp_video_path,                  # processed video (no audio)
-            "-ss", str(start_seconds),               # seek audio to segment start
-            "-to", str(end_seconds),                 # audio end point
-            "-i", video_path,                        # original source for audio
-            "-map", "0:v:0",                         # video from processed file
-            "-map", "1:a:0",                         # audio from original source
-            "-c:v", "copy",                          # don't re-encode video
-            "-c:a", "aac",                           # encode audio to AAC
             "-b:a", "192k",
             "-shortest",
             "-movflags", "+faststart",
@@ -317,12 +372,8 @@ def process_video_segment(
             stderr=subprocess.DEVNULL
         )
-        # Clean up temp video file
-        if os.path.exists(temp_video_path):
-            os.remove(temp_video_path)
         if mux_result.returncode != 0:
-            print(f"✗ FFmpeg audio mux failed with return code {mux_result.returncode}")
             return False
         print(f"✓ Video segment with audio saved: {output_path}")
@@ -336,12 +387,14 @@ def process_video_segment(
             except Exception:
                 pass
             ffmpeg_video_proc.wait()
-        # Clean up temp file if it exists
-        temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
-        if os.path.exists(temp_video_path):
-            os.remove(temp_video_path)
         return False
 async def process_movie_segments(movie_name: str) -> bool:
     """Process all segments for a movie."""
@@ -351,24 +404,6 @@ async def process_movie_segments(movie_name: str) -> bool:
         print(f"Processing movie: {movie_name}")
         print(f"{'='*80}")
-        # Download transcript
-        transcript_file = f"{TRANSCRIPTION_FOLDER}/{movie_name}.transcript.txt"
-        print(f"Downloading transcript: {transcript_file}")
-        try:
-            transcript_path = hf_hub_download(
-                repo_id=HF_DATASET_REPO,
-                filename=transcript_file,
-                repo_type="dataset",
-                token=HF_TOKEN,
-                cache_dir="/tmp/video_processor_cache"
-            )
-            with open(transcript_path, 'r', encoding='utf-8') as f:
-                transcript_content = f.read()
-        except Exception as e:
-            print(f"Warning: Could not download transcript: {e}")
-            transcript_content = ""
         # Download original video
         video_file = f"{movie_name}.mkv"
         print(f"Downloading video: {video_file}")
@@ -430,9 +465,6 @@ async def process_movie_segments(movie_name: str) -> bool:
                     print(f"\nProcessing segment {segment_number}: {start_time} to {end_time}")
-                    captions = extract_captions_for_segment(transcript_content, start_time, end_time)
-                    print(f"Found {len(captions)} caption lines for this segment")
                     output_filename = f"segment-{segment_number:02d}.mp4"
                     output_path = os.path.join(temp_dir, output_filename)
@@ -440,8 +472,7 @@ async def process_movie_segments(movie_name: str) -> bool:
                         video_path,
                         output_path,
                         start_time,
-                        end_time,
-                        captions
                     )
                     if not success:
@@ -489,7 +520,7 @@ async def scan_and_process_videos():
         return
     print("Waiting 3 minutes before starting video processing...")
-    await asyncio.sleep(180)  # 3-minute startup delay
     processing_state["is_running"] = True
     print("\n" + "="*80)
@@ -531,13 +562,11 @@ async def scan_and_process_videos():
 @app.on_event("startup")
 async def startup_event():
-    """Start video processing on server startup."""
     asyncio.create_task(scan_and_process_videos())
 @app.get("/")
 async def health():
-    """Health check endpoint."""
     return JSONResponse({
         "status": "running",
         "service": "Video Processing Service",
@@ -552,7 +581,6 @@ async def health():
 @app.get("/status")
 async def get_status():
-    """Get current processing status."""
     return JSONResponse({
         "is_running": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
@@ -565,13 +593,11 @@ async def get_status():
 @app.post("/trigger-processing")
 async def trigger_processing():
-    """Manually trigger video processing (skips the startup delay)."""
     if processing_state["is_running"]:
         return JSONResponse({
             "status": "already_running",
             "message": "Video processing is already in progress"
         })
     asyncio.create_task(scan_and_process_videos())
     return JSONResponse({
         "status": "started",

 from pathlib import Path
 from datetime import datetime
 from dotenv import load_dotenv
+from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
     import cv2
     import numpy as np
     from PIL import Image, ImageDraw, ImageFont
+    from faster_whisper import WhisperModel
 except ImportError as e:
     print(f"Missing dependency: {e}")
+    print("Install with: pip install faster-whisper")
     exit(1)
 # Load environment variables
     "processed_files": []
 }
+# Load Whisper model once at startup (small = good balance of speed/accuracy)
+print("Loading Whisper small model...")
+whisper_model = WhisperModel("small", device="auto", compute_type="int8")
+print("✓ Whisper model loaded")
 HF_DATASET_REPO = "factorstudios/movs"
 HOOKS_FOLDER = "hooks"
 READY_VIDEOS_FOLDER = "ready_videos"
     """Convert HH:MM:SS to seconds."""
     try:
         parts = timestamp.split(":")
+        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
     except Exception as e:
         print(f"Error converting timestamp {timestamp}: {e}")
         return 0.0
def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: float, output_wav: str) -> bool:
    """Extract one time window of a video's audio track into a WAV file.

    Invokes ``ffmpeg`` to write the audio between ``start_seconds`` and
    ``end_seconds`` of ``video_path`` to ``output_wav`` as 16 kHz, mono,
    16-bit PCM. An existing ``output_wav`` is overwritten (``-y``).
    ffmpeg's stdout and stderr are discarded.

    Args:
        video_path: Path of the source video file.
        start_seconds: Start of the window, in seconds from the beginning
            of the source.
        end_seconds: End of the window, in seconds from the beginning of
            the source.
        output_wav: Path of the WAV file to write.

    Returns:
        True if ffmpeg exited with status 0. False if it exited with a
        non-zero status or could not be launched at all (for example
        because the ``ffmpeg`` executable is not installed).
    """
    cmd = [
        "ffmpeg", "-y",
        "-ss", str(start_seconds),
        "-to", str(end_seconds),
        "-i", video_path,
        "-vn",                    # drop the video stream
        "-acodec", "pcm_s16le",   # uncompressed 16-bit PCM (WAV)
        "-ar", "16000",           # 16 kHz sample rate, as used by Whisper
        "-ac", "1",               # downmix to mono
        output_wav
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:
        # A missing or non-executable ffmpeg raises instead of returning a
        # status code; report it as an ordinary failure so the caller's
        # "skip captions" fallback still applies.
        print(f"  Error: could not run ffmpeg: {e}")
        return False
    return result.returncode == 0
def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
    """
    Run the module-level Whisper model over a single audio file.

    Each recognised segment is logged as it is read. Segments whose text
    is empty after stripping whitespace are dropped.

    Returns a list of (start_sec, end_sec, text) tuples; the times are the
    ones reported by the model, i.e. relative to the start of the audio
    file that was passed in.
    """
    print("  Transcribing audio with Whisper small...")
    vad_options = {"min_silence_duration_ms": 500}
    recognised, _info = whisper_model.transcribe(
        audio_path,
        beam_size=5,
        language=None,          # no language forced; left to the model
        vad_filter=True,        # voice-activity filtering enabled
        vad_parameters=vad_options,
    )

    collected = []
    for piece in recognised:
        spoken = piece.text.strip()
        if not spoken:
            continue
        collected.append((piece.start, piece.end, spoken))
        print(f"  [{piece.start:.1f}s → {piece.end:.1f}s] {spoken}")

    print(f"  ✓ Transcribed {len(collected)} caption segments")
    return collected
     # Position: 80% down the frame (near bottom, not center)
     y_start = int(height * 0.80) - total_text_height // 2
     shadow_offset = 3
     for i, line in enumerate(wrapped_lines):
         x = (width - line_width) // 2
         y = y_start + i * line_height
+        # Shadow layer
         draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
+        # Main white text
         draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
     frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
     return cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
    """
    Expand timed captions into a frame-number -> caption-text lookup.

    Every (start_sec, end_sec, text) entry is converted to a frame range by
    multiplying its times by ``fps`` and truncating to an int. The end frame
    is included in the range. When two captions cover the same frame, the
    one that appears later in ``captions`` wins. Frames that no caption
    covers are simply absent from the returned dict.
    """
    lookup: Dict[int, str] = {}
    for begin, finish, caption_text in captions:
        first = int(begin * fps)
        last = int(finish * fps)
        # range() is empty when last < first, so reversed times add nothing.
        lookup.update(dict.fromkeys(range(first, last + 1), caption_text))
    return lookup
 def process_video_segment(
     video_path: str,
     output_path: str,
     start_time: str,
     end_time: str,
     target_width: int = 1080,
     target_height: int = 1350
 ) -> bool:
+    """
+    Full pipeline:
+    1. Extract audio segment → WAV
+    2. Transcribe with Whisper small
+    3. Process frames with color grading + caption burn-in
+    4. Mux processed video with original audio
+    """
     ffmpeg_video_proc = None
+    temp_wav = None
+    temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
     try:
         print(f"Opening video: {video_path}")
         cap = cv2.VideoCapture(video_path)
         print(f"Video info: {fps} fps, {original_width}x{original_height}")
         print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
+        # ── Step 1: Extract audio segment as WAV ──────────────────────────────
+        temp_wav = output_path.replace(".mp4", "_audio.wav")
+        print("  Extracting audio segment...")
+        audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
+        if not audio_ok:
+            print("  Warning: Audio extraction failed, captions will be skipped")
+            captions = []
+        else:
+            # ── Step 2: Transcribe with Whisper ───────────────────────────────
+            captions = transcribe_segment(temp_wav)
+        # Build per-frame caption lookup from Whisper timestamps
+        frame_caption_map = build_frame_caption_map(captions, fps)
+        # ── Step 3: Process frames → pipe to FFmpeg ───────────────────────────
         ffmpeg_video_cmd = [
             "ffmpeg", "-y",
             "-f", "rawvideo",
             stderr=subprocess.DEVNULL
         )
         start_frame = int(start_seconds * fps)
         cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
         current_caption = ""
         processed_frames = 0
         target_frames = int(duration * fps)
             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
             frame = apply_color_grading_wedding_retro(frame)
+            # Update caption from Whisper frame map
+            if processed_frames in frame_caption_map:
+                current_caption = frame_caption_map[processed_frames]
+            elif processed_frames not in frame_caption_map and current_caption:
+                # Clear caption when we're past its end frame
+                current_caption = frame_caption_map.get(processed_frames, "")
             if current_caption:
                 frame = burn_captions_to_frame(frame, current_caption)
         cap.release()
         if ffmpeg_video_proc.returncode != 0:
+            print(f"✗ FFmpeg video encoding failed (code {ffmpeg_video_proc.returncode})")
             return False
+        print("✓ Frames encoded, muxing audio...")
+        # ── Step 4: Mux processed video + original audio ──────────────────────
         ffmpeg_mux_cmd = [
             "ffmpeg", "-y",
+            "-i", temp_video_path,
+            "-ss", str(start_seconds),
+            "-to", str(end_seconds),
+            "-i", video_path,
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            "-c:v", "copy",
+            "-c:a", "aac",
             "-b:a", "192k",
             "-shortest",
             "-movflags", "+faststart",
             stderr=subprocess.DEVNULL
         )
         if mux_result.returncode != 0:
+            print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
             return False
         print(f"✓ Video segment with audio saved: {output_path}")
             except Exception:
                 pass
             ffmpeg_video_proc.wait()
         return False
+    finally:
+        # Clean up temp files
+        for tmp in [temp_video_path, temp_wav]:
+            if tmp and os.path.exists(tmp):
+                os.remove(tmp)
 async def process_movie_segments(movie_name: str) -> bool:
     """Process all segments for a movie."""
         print(f"Processing movie: {movie_name}")
         print(f"{'='*80}")
         # Download original video
         video_file = f"{movie_name}.mkv"
         print(f"Downloading video: {video_file}")
                     print(f"\nProcessing segment {segment_number}: {start_time} to {end_time}")
                     output_filename = f"segment-{segment_number:02d}.mp4"
                     output_path = os.path.join(temp_dir, output_filename)
                         video_path,
                         output_path,
                         start_time,
+                        end_time
                     )
                     if not success:
         return
     print("Waiting 3 minutes before starting video processing...")
+    await asyncio.sleep(180)
     processing_state["is_running"] = True
     print("\n" + "="*80)
 @app.on_event("startup")
 async def startup_event():
     asyncio.create_task(scan_and_process_videos())
 @app.get("/")
 async def health():
     return JSONResponse({
         "status": "running",
         "service": "Video Processing Service",
 @app.get("/status")
 async def get_status():
     return JSONResponse({
         "is_running": processing_state["is_running"],
         "total_processed": processing_state["total_processed"],
 @app.post("/trigger-processing")
 async def trigger_processing():
     if processing_state["is_running"]:
         return JSONResponse({
             "status": "already_running",
             "message": "Video processing is already in progress"
         })
     asyncio.create_task(scan_and_process_videos())
     return JSONResponse({
         "status": "started",