factorstudios committed on
Commit
abb20ff
·
verified ·
1 Parent(s): 3b93ec7

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +112 -86
server.py CHANGED
@@ -8,7 +8,7 @@ import subprocess
8
  from pathlib import Path
9
  from datetime import datetime
10
  from dotenv import load_dotenv
11
- from typing import List, Dict, Optional
12
 
13
  from fastapi import FastAPI, HTTPException
14
  from fastapi.responses import JSONResponse
@@ -19,8 +19,10 @@ try:
19
  import cv2
20
  import numpy as np
21
  from PIL import Image, ImageDraw, ImageFont
 
22
  except ImportError as e:
23
  print(f"Missing dependency: {e}")
 
24
  exit(1)
25
 
26
  # Load environment variables
@@ -42,6 +44,11 @@ processing_state = {
42
  "processed_files": []
43
  }
44
 
 
 
 
 
 
45
  HF_DATASET_REPO = "factorstudios/movs"
46
  HOOKS_FOLDER = "hooks"
47
  READY_VIDEOS_FOLDER = "ready_videos"
@@ -52,33 +59,51 @@ def timestamp_to_seconds(timestamp: str) -> float:
52
  """Convert HH:MM:SS to seconds."""
53
  try:
54
  parts = timestamp.split(":")
55
- hours = int(parts[0])
56
- minutes = int(parts[1])
57
- seconds = int(parts[2])
58
- return hours * 3600 + minutes * 60 + seconds
59
  except Exception as e:
60
  print(f"Error converting timestamp {timestamp}: {e}")
61
  return 0.0
62
 
63
 
64
- def extract_captions_for_segment(transcript_content: str, start_time: str, end_time: str) -> List[tuple]:
65
- """Extract captions from transcript that fall within segment timeframe.
66
- Returns list of (relative_seconds, text) tuples."""
67
- captions = []
68
- start_seconds = timestamp_to_seconds(start_time)
69
- end_seconds = timestamp_to_seconds(end_time)
70
-
71
- lines = transcript_content.strip().split('\n')
72
- for line in lines:
73
- match = re.match(r'\[(\d{2}):(\d{2}):(\d{2})\]\s+(.*)', line)
74
- if match:
75
- h, m, s, text = match.groups()
76
- line_seconds = int(h) * 3600 + int(m) * 60 + int(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- if start_seconds <= line_seconds <= end_seconds:
79
- relative_time = line_seconds - start_seconds
80
- captions.append((relative_time, text.strip()))
 
 
 
81
 
 
82
  return captions
83
 
84
 
@@ -162,7 +187,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
162
 
163
  # Position: 80% down the frame (near bottom, not center)
164
  y_start = int(height * 0.80) - total_text_height // 2
165
-
166
  shadow_offset = 3
167
 
168
  for i, line in enumerate(wrapped_lines):
@@ -171,26 +195,48 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
171
  x = (width - line_width) // 2
172
  y = y_start + i * line_height
173
 
174
- # Draw shadow (dark, slightly offset)
175
  draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
176
- # Draw main white text
177
  draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
178
 
179
  frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
180
  return cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
181
 
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  def process_video_segment(
184
  video_path: str,
185
  output_path: str,
186
  start_time: str,
187
  end_time: str,
188
- captions: List[tuple],
189
  target_width: int = 1080,
190
  target_height: int = 1350
191
  ) -> bool:
192
- """Process video segment: crop, resize, color grade, burn captions, encode with audio via FFmpeg."""
 
 
 
 
 
 
193
  ffmpeg_video_proc = None
 
 
 
194
  try:
195
  print(f"Opening video: {video_path}")
196
  cap = cv2.VideoCapture(video_path)
@@ -210,9 +256,21 @@ def process_video_segment(
210
  print(f"Video info: {fps} fps, {original_width}x{original_height}")
211
  print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
212
 
213
- # Step 1: Write processed frames to a temp video-only file
214
- temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
 
 
 
 
 
 
 
 
 
 
 
215
 
 
216
  ffmpeg_video_cmd = [
217
  "ffmpeg", "-y",
218
  "-f", "rawvideo",
@@ -235,16 +293,9 @@ def process_video_segment(
235
  stderr=subprocess.DEVNULL
236
  )
237
 
238
- # Seek to start frame
239
  start_frame = int(start_seconds * fps)
240
  cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
241
 
242
- # Build caption lookup: frame_number -> text
243
- caption_map = {}
244
- for rel_time, caption_text in captions:
245
- frame_num = int(rel_time * fps)
246
- caption_map[frame_num] = caption_text
247
-
248
  current_caption = ""
249
  processed_frames = 0
250
  target_frames = int(duration * fps)
@@ -271,8 +322,12 @@ def process_video_segment(
271
  frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
272
  frame = apply_color_grading_wedding_retro(frame)
273
 
274
- if processed_frames in caption_map:
275
- current_caption = caption_map[processed_frames]
 
 
 
 
276
 
277
  if current_caption:
278
  frame = burn_captions_to_frame(frame, current_caption)
@@ -289,22 +344,22 @@ def process_video_segment(
289
  cap.release()
290
 
291
  if ffmpeg_video_proc.returncode != 0:
292
- print(f"✗ FFmpeg video encoding failed with return code {ffmpeg_video_proc.returncode}")
293
  return False
294
 
295
- print("✓ Video frames encoded, muxing audio...")
296
 
297
- # Step 2: Mux processed video with audio extracted directly from source
298
  ffmpeg_mux_cmd = [
299
  "ffmpeg", "-y",
300
- "-i", temp_video_path, # processed video (no audio)
301
- "-ss", str(start_seconds), # seek audio to segment start
302
- "-to", str(end_seconds), # audio end point
303
- "-i", video_path, # original source for audio
304
- "-map", "0:v:0", # video from processed file
305
- "-map", "1:a:0", # audio from original source
306
- "-c:v", "copy", # don't re-encode video
307
- "-c:a", "aac", # encode audio to AAC
308
  "-b:a", "192k",
309
  "-shortest",
310
  "-movflags", "+faststart",
@@ -317,12 +372,8 @@ def process_video_segment(
317
  stderr=subprocess.DEVNULL
318
  )
319
 
320
- # Clean up temp video file
321
- if os.path.exists(temp_video_path):
322
- os.remove(temp_video_path)
323
-
324
  if mux_result.returncode != 0:
325
- print(f"✗ FFmpeg audio mux failed with return code {mux_result.returncode}")
326
  return False
327
 
328
  print(f"✓ Video segment with audio saved: {output_path}")
@@ -336,12 +387,14 @@ def process_video_segment(
336
  except Exception:
337
  pass
338
  ffmpeg_video_proc.wait()
339
- # Clean up temp file if it exists
340
- temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
341
- if os.path.exists(temp_video_path):
342
- os.remove(temp_video_path)
343
  return False
344
 
 
 
 
 
 
 
345
 
346
  async def process_movie_segments(movie_name: str) -> bool:
347
  """Process all segments for a movie."""
@@ -351,24 +404,6 @@ async def process_movie_segments(movie_name: str) -> bool:
351
  print(f"Processing movie: {movie_name}")
352
  print(f"{'='*80}")
353
 
354
- # Download transcript
355
- transcript_file = f"{TRANSCRIPTION_FOLDER}/{movie_name}.transcript.txt"
356
- print(f"Downloading transcript: {transcript_file}")
357
-
358
- try:
359
- transcript_path = hf_hub_download(
360
- repo_id=HF_DATASET_REPO,
361
- filename=transcript_file,
362
- repo_type="dataset",
363
- token=HF_TOKEN,
364
- cache_dir="/tmp/video_processor_cache"
365
- )
366
- with open(transcript_path, 'r', encoding='utf-8') as f:
367
- transcript_content = f.read()
368
- except Exception as e:
369
- print(f"Warning: Could not download transcript: {e}")
370
- transcript_content = ""
371
-
372
  # Download original video
373
  video_file = f"{movie_name}.mkv"
374
  print(f"Downloading video: {video_file}")
@@ -430,9 +465,6 @@ async def process_movie_segments(movie_name: str) -> bool:
430
 
431
  print(f"\nProcessing segment {segment_number}: {start_time} to {end_time}")
432
 
433
- captions = extract_captions_for_segment(transcript_content, start_time, end_time)
434
- print(f"Found {len(captions)} caption lines for this segment")
435
-
436
  output_filename = f"segment-{segment_number:02d}.mp4"
437
  output_path = os.path.join(temp_dir, output_filename)
438
 
@@ -440,8 +472,7 @@ async def process_movie_segments(movie_name: str) -> bool:
440
  video_path,
441
  output_path,
442
  start_time,
443
- end_time,
444
- captions
445
  )
446
 
447
  if not success:
@@ -489,7 +520,7 @@ async def scan_and_process_videos():
489
  return
490
 
491
  print("Waiting 3 minutes before starting video processing...")
492
- await asyncio.sleep(180) # 3-minute startup delay
493
 
494
  processing_state["is_running"] = True
495
  print("\n" + "="*80)
@@ -531,13 +562,11 @@ async def scan_and_process_videos():
531
 
532
  @app.on_event("startup")
533
  async def startup_event():
534
- """Start video processing on server startup."""
535
  asyncio.create_task(scan_and_process_videos())
536
 
537
 
538
  @app.get("/")
539
  async def health():
540
- """Health check endpoint."""
541
  return JSONResponse({
542
  "status": "running",
543
  "service": "Video Processing Service",
@@ -552,7 +581,6 @@ async def health():
552
 
553
  @app.get("/status")
554
  async def get_status():
555
- """Get current processing status."""
556
  return JSONResponse({
557
  "is_running": processing_state["is_running"],
558
  "total_processed": processing_state["total_processed"],
@@ -565,13 +593,11 @@ async def get_status():
565
 
566
  @app.post("/trigger-processing")
567
  async def trigger_processing():
568
- """Manually trigger video processing (skips the startup delay)."""
569
  if processing_state["is_running"]:
570
  return JSONResponse({
571
  "status": "already_running",
572
  "message": "Video processing is already in progress"
573
  })
574
-
575
  asyncio.create_task(scan_and_process_videos())
576
  return JSONResponse({
577
  "status": "started",
 
8
  from pathlib import Path
9
  from datetime import datetime
10
  from dotenv import load_dotenv
11
+ from typing import List, Dict, Optional, Tuple
12
 
13
  from fastapi import FastAPI, HTTPException
14
  from fastapi.responses import JSONResponse
 
19
  import cv2
20
  import numpy as np
21
  from PIL import Image, ImageDraw, ImageFont
22
+ from faster_whisper import WhisperModel
23
  except ImportError as e:
24
  print(f"Missing dependency: {e}")
25
+ print("Install with: pip install faster-whisper")
26
  exit(1)
27
 
28
  # Load environment variables
 
44
  "processed_files": []
45
  }
46
 
47
+ # Load Whisper model once at startup (small = good balance of speed/accuracy)
48
+ print("Loading Whisper small model...")
49
+ whisper_model = WhisperModel("small", device="auto", compute_type="int8")
50
+ print("✓ Whisper model loaded")
51
+
52
  HF_DATASET_REPO = "factorstudios/movs"
53
  HOOKS_FOLDER = "hooks"
54
  READY_VIDEOS_FOLDER = "ready_videos"
 
59
  """Convert HH:MM:SS to seconds."""
60
  try:
61
  parts = timestamp.split(":")
62
+ return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
 
 
 
63
  except Exception as e:
64
  print(f"Error converting timestamp {timestamp}: {e}")
65
  return 0.0
66
 
67
 
68
+ def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: float, output_wav: str) -> bool:
69
+ """Extract audio segment from video as WAV for Whisper."""
70
+ cmd = [
71
+ "ffmpeg", "-y",
72
+ "-ss", str(start_seconds),
73
+ "-to", str(end_seconds),
74
+ "-i", video_path,
75
+ "-vn", # no video
76
+ "-acodec", "pcm_s16le", # WAV format Whisper expects
77
+ "-ar", "16000", # 16kHz sample rate (Whisper requirement)
78
+ "-ac", "1", # mono
79
+ output_wav
80
+ ]
81
+ result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
82
+ return result.returncode == 0
83
+
84
+
85
+ def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
86
+ """
87
+ Transcribe audio with Whisper small.
88
+ Returns list of (start_sec, end_sec, text) — all relative to segment start.
89
+ """
90
+ print(" Transcribing audio with Whisper small...")
91
+ segments, info = whisper_model.transcribe(
92
+ audio_path,
93
+ beam_size=5,
94
+ language=None, # auto-detect language
95
+ vad_filter=True, # skip silence
96
+ vad_parameters=dict(min_silence_duration_ms=500)
97
+ )
98
 
99
+ captions = []
100
+ for seg in segments:
101
+ text = seg.text.strip()
102
+ if text:
103
+ captions.append((seg.start, seg.end, text))
104
+ print(f" [{seg.start:.1f}s → {seg.end:.1f}s] {text}")
105
 
106
+ print(f" ✓ Transcribed {len(captions)} caption segments")
107
  return captions
108
 
109
 
 
187
 
188
  # Position: 80% down the frame (near bottom, not center)
189
  y_start = int(height * 0.80) - total_text_height // 2
 
190
  shadow_offset = 3
191
 
192
  for i, line in enumerate(wrapped_lines):
 
195
  x = (width - line_width) // 2
196
  y = y_start + i * line_height
197
 
198
+ # Shadow layer
199
  draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
200
+ # Main white text
201
  draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
202
 
203
  frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
204
  return cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
205
 
206
 
207
+ def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
208
+ """
209
+ Convert Whisper (start, end, text) segments into a per-frame caption map.
210
+ Each frame number maps to the caption active at that time.
211
+ """
212
+ frame_map = {}
213
+ for start_sec, end_sec, text in captions:
214
+ start_frame = int(start_sec * fps)
215
+ end_frame = int(end_sec * fps)
216
+ for f in range(start_frame, end_frame + 1):
217
+ frame_map[f] = text
218
+ return frame_map
219
+
220
+
221
  def process_video_segment(
222
  video_path: str,
223
  output_path: str,
224
  start_time: str,
225
  end_time: str,
 
226
  target_width: int = 1080,
227
  target_height: int = 1350
228
  ) -> bool:
229
+ """
230
+ Full pipeline:
231
+ 1. Extract audio segment → WAV
232
+ 2. Transcribe with Whisper small
233
+ 3. Process frames with color grading + caption burn-in
234
+ 4. Mux processed video with original audio
235
+ """
236
  ffmpeg_video_proc = None
237
+ temp_wav = None
238
+ temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
239
+
240
  try:
241
  print(f"Opening video: {video_path}")
242
  cap = cv2.VideoCapture(video_path)
 
256
  print(f"Video info: {fps} fps, {original_width}x{original_height}")
257
  print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
258
 
259
+ # ── Step 1: Extract audio segment as WAV ──────────────────────────────
260
+ temp_wav = output_path.replace(".mp4", "_audio.wav")
261
+ print(" Extracting audio segment...")
262
+ audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
263
+ if not audio_ok:
264
+ print(" Warning: Audio extraction failed, captions will be skipped")
265
+ captions = []
266
+ else:
267
+ # ── Step 2: Transcribe with Whisper ───────────────────────────────
268
+ captions = transcribe_segment(temp_wav)
269
+
270
+ # Build per-frame caption lookup from Whisper timestamps
271
+ frame_caption_map = build_frame_caption_map(captions, fps)
272
 
273
+ # ── Step 3: Process frames β†’ pipe to FFmpeg ───────────────────────────
274
  ffmpeg_video_cmd = [
275
  "ffmpeg", "-y",
276
  "-f", "rawvideo",
 
293
  stderr=subprocess.DEVNULL
294
  )
295
 
 
296
  start_frame = int(start_seconds * fps)
297
  cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
298
 
 
 
 
 
 
 
299
  current_caption = ""
300
  processed_frames = 0
301
  target_frames = int(duration * fps)
 
322
  frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
323
  frame = apply_color_grading_wedding_retro(frame)
324
 
325
+ # Update caption from Whisper frame map
326
+ if processed_frames in frame_caption_map:
327
+ current_caption = frame_caption_map[processed_frames]
328
+ elif processed_frames not in frame_caption_map and current_caption:
329
+ # Clear caption when we're past its end frame
330
+ current_caption = frame_caption_map.get(processed_frames, "")
331
 
332
  if current_caption:
333
  frame = burn_captions_to_frame(frame, current_caption)
 
344
  cap.release()
345
 
346
  if ffmpeg_video_proc.returncode != 0:
347
+ print(f"✗ FFmpeg video encoding failed (code {ffmpeg_video_proc.returncode})")
348
  return False
349
 
350
+ print("✓ Frames encoded, muxing audio...")
351
 
352
+ # ── Step 4: Mux processed video + original audio ──────────────────────
353
  ffmpeg_mux_cmd = [
354
  "ffmpeg", "-y",
355
+ "-i", temp_video_path,
356
+ "-ss", str(start_seconds),
357
+ "-to", str(end_seconds),
358
+ "-i", video_path,
359
+ "-map", "0:v:0",
360
+ "-map", "1:a:0",
361
+ "-c:v", "copy",
362
+ "-c:a", "aac",
363
  "-b:a", "192k",
364
  "-shortest",
365
  "-movflags", "+faststart",
 
372
  stderr=subprocess.DEVNULL
373
  )
374
 
 
 
 
 
375
  if mux_result.returncode != 0:
376
+ print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
377
  return False
378
 
379
  print(f"✓ Video segment with audio saved: {output_path}")
 
387
  except Exception:
388
  pass
389
  ffmpeg_video_proc.wait()
 
 
 
 
390
  return False
391
 
392
+ finally:
393
+ # Clean up temp files
394
+ for tmp in [temp_video_path, temp_wav]:
395
+ if tmp and os.path.exists(tmp):
396
+ os.remove(tmp)
397
+
398
 
399
  async def process_movie_segments(movie_name: str) -> bool:
400
  """Process all segments for a movie."""
 
404
  print(f"Processing movie: {movie_name}")
405
  print(f"{'='*80}")
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  # Download original video
408
  video_file = f"{movie_name}.mkv"
409
  print(f"Downloading video: {video_file}")
 
465
 
466
  print(f"\nProcessing segment {segment_number}: {start_time} to {end_time}")
467
 
 
 
 
468
  output_filename = f"segment-{segment_number:02d}.mp4"
469
  output_path = os.path.join(temp_dir, output_filename)
470
 
 
472
  video_path,
473
  output_path,
474
  start_time,
475
+ end_time
 
476
  )
477
 
478
  if not success:
 
520
  return
521
 
522
  print("Waiting 3 minutes before starting video processing...")
523
+ await asyncio.sleep(180)
524
 
525
  processing_state["is_running"] = True
526
  print("\n" + "="*80)
 
562
 
563
  @app.on_event("startup")
564
  async def startup_event():
 
565
  asyncio.create_task(scan_and_process_videos())
566
 
567
 
568
  @app.get("/")
569
  async def health():
 
570
  return JSONResponse({
571
  "status": "running",
572
  "service": "Video Processing Service",
 
581
 
582
  @app.get("/status")
583
  async def get_status():
 
584
  return JSONResponse({
585
  "is_running": processing_state["is_running"],
586
  "total_processed": processing_state["total_processed"],
 
593
 
594
  @app.post("/trigger-processing")
595
  async def trigger_processing():
 
596
  if processing_state["is_running"]:
597
  return JSONResponse({
598
  "status": "already_running",
599
  "message": "Video processing is already in progress"
600
  })
 
601
  asyncio.create_task(scan_and_process_videos())
602
  return JSONResponse({
603
  "status": "started",