factorstudios committed on
Commit
ddb7115
·
verified ·
1 Parent(s): abb20ff

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +54 -51
server.py CHANGED
@@ -1,3 +1,4 @@
 
1
  #!/usr/bin/env python3
2
  import os
3
  import json
@@ -41,13 +42,12 @@ processing_state = {
41
  "current_file": None,
42
  "error_count": 0,
43
  "last_error": None,
44
- "processed_files": []
 
45
  }
46
 
47
- # Load Whisper model once at startup (small = good balance of speed/accuracy)
48
- print("Loading Whisper small model...")
49
- whisper_model = WhisperModel("small", device="auto", compute_type="int8")
50
- print("✓ Whisper model loaded")
51
 
52
  HF_DATASET_REPO = "factorstudios/movs"
53
  HOOKS_FOLDER = "hooks"
@@ -55,6 +55,15 @@ READY_VIDEOS_FOLDER = "ready_videos"
55
  TRANSCRIPTION_FOLDER = "transcriptions"
56
 
57
 
 
 
 
 
 
 
 
 
 
58
  def timestamp_to_seconds(timestamp: str) -> float:
59
  """Convert HH:MM:SS to seconds."""
60
  try:
@@ -72,10 +81,10 @@ def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: fl
72
  "-ss", str(start_seconds),
73
  "-to", str(end_seconds),
74
  "-i", video_path,
75
- "-vn", # no video
76
- "-acodec", "pcm_s16le", # WAV format Whisper expects
77
- "-ar", "16000", # 16kHz sample rate (Whisper requirement)
78
- "-ac", "1", # mono
79
  output_wav
80
  ]
81
  result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
@@ -85,14 +94,14 @@ def extract_audio_segment(video_path: str, start_seconds: float, end_seconds: fl
85
  def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
86
  """
87
  Transcribe audio with Whisper small.
88
- Returns list of (start_sec, end_sec, text) — all relative to segment start.
89
  """
90
  print(" Transcribing audio with Whisper small...")
91
  segments, info = whisper_model.transcribe(
92
  audio_path,
93
  beam_size=5,
94
- language=None, # auto-detect language
95
- vad_filter=True, # skip silence
96
  vad_parameters=dict(min_silence_duration_ms=500)
97
  )
98
 
@@ -112,33 +121,27 @@ def apply_color_grading_wedding_retro(frame: np.ndarray) -> np.ndarray:
112
  lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
113
  l_channel, a_channel, b_channel = cv2.split(lab)
114
 
115
- # 1. VINTAGE/RETRO: warm tones
116
  a_channel = cv2.add(a_channel, 5)
117
  b_channel = cv2.add(b_channel, 8)
118
 
119
- # 2. WEDDING LOOK: soft highlights via CLAHE
120
  clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
121
  l_channel = clahe.apply(l_channel)
122
 
123
  lab_enhanced = cv2.merge([l_channel, a_channel, b_channel])
124
  frame = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)
125
 
126
- # 3. SATURATION BOOST
127
  hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV).astype(np.float32)
128
  hsv[:, :, 1] = np.clip(hsv[:, :, 1] * 1.3, 0, 255)
129
  frame = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
130
 
131
- # 4. CONTRAST ENHANCEMENT
132
  frame = cv2.convertScaleAbs(frame, alpha=1.15, beta=10)
133
 
134
- # 5. HIGH SHARPENING
135
  kernel = np.array([[-1, -1, -1],
136
  [-1, 9, -1],
137
  [-1, -1, -1]]) / 1.2
138
  sharpened = cv2.filter2D(frame, -1, kernel)
139
  frame = cv2.addWeighted(frame, 0.4, sharpened, 0.6, 0)
140
 
141
- # 6. SLIGHT VIGNETTE
142
  rows, cols = frame.shape[:2]
143
  X_kernel = cv2.getGaussianKernel(cols, cols / 2)
144
  Y_kernel = cv2.getGaussianKernel(rows, rows / 2)
@@ -164,7 +167,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
164
  except Exception:
165
  font = ImageFont.load_default()
166
 
167
- # Word-wrap text
168
  max_width = width - 80
169
  wrapped_lines = []
170
  words = text.split()
@@ -184,8 +186,6 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
184
 
185
  line_height = font_size + 12
186
  total_text_height = len(wrapped_lines) * line_height
187
-
188
- # Position: 80% down the frame (near bottom, not center)
189
  y_start = int(height * 0.80) - total_text_height // 2
190
  shadow_offset = 3
191
 
@@ -195,9 +195,7 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
195
  x = (width - line_width) // 2
196
  y = y_start + i * line_height
197
 
198
- # Shadow layer
199
  draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
200
- # Main white text
201
  draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
202
 
203
  frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
@@ -205,10 +203,7 @@ def burn_captions_to_frame(frame: np.ndarray, text: str, font_size: int = 36) ->
205
 
206
 
207
  def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
208
- """
209
- Convert Whisper (start, end, text) segments into a per-frame caption map.
210
- Each frame number maps to the caption active at that time.
211
- """
212
  frame_map = {}
213
  for start_sec, end_sec, text in captions:
214
  start_frame = int(start_sec * fps)
@@ -234,7 +229,7 @@ def process_video_segment(
234
  4. Mux processed video with original audio
235
  """
236
  ffmpeg_video_proc = None
237
- temp_wav = None
238
  temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
239
 
240
  try:
@@ -256,18 +251,17 @@ def process_video_segment(
256
  print(f"Video info: {fps} fps, {original_width}x{original_height}")
257
  print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
258
 
259
- # ── Step 1: Extract audio segment as WAV ──────────────────────────────
260
- temp_wav = output_path.replace(".mp4", "_audio.wav")
261
  print(" Extracting audio segment...")
262
  audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
263
- if not audio_ok:
264
- print(" Warning: Audio extraction failed, captions will be skipped")
265
- captions = []
266
- else:
267
- # ── Step 2: Transcribe with Whisper ───────────────────────────────
268
  captions = transcribe_segment(temp_wav)
 
 
 
269
 
270
- # Build per-frame caption lookup from Whisper timestamps
271
  frame_caption_map = build_frame_caption_map(captions, fps)
272
 
273
  # ── Step 3: Process frames → pipe to FFmpeg ───────────────────────────
@@ -308,7 +302,6 @@ def process_video_segment(
308
  print(f"Warning: Could not read frame at position {processed_frames}")
309
  break
310
 
311
- # Crop to target aspect ratio
312
  aspect_ratio = target_width / target_height
313
  if original_width / original_height > aspect_ratio:
314
  new_width = int(original_height * aspect_ratio)
@@ -322,12 +315,10 @@ def process_video_segment(
322
  frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
323
  frame = apply_color_grading_wedding_retro(frame)
324
 
325
- # Update caption from Whisper frame map
326
- if processed_frames in frame_caption_map:
327
- current_caption = frame_caption_map[processed_frames]
328
- elif processed_frames not in frame_caption_map and current_caption:
329
- # Clear caption when we're past its end frame
330
- current_caption = frame_caption_map.get(processed_frames, "")
331
 
332
  if current_caption:
333
  frame = burn_captions_to_frame(frame, current_caption)
@@ -376,7 +367,7 @@ def process_video_segment(
376
  print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
377
  return False
378
 
379
- print(f"✓ Video segment with audio saved: {output_path}")
380
  return True
381
 
382
  except Exception as e:
@@ -390,10 +381,12 @@ def process_video_segment(
390
  return False
391
 
392
  finally:
393
- # Clean up temp files
394
  for tmp in [temp_video_path, temp_wav]:
395
  if tmp and os.path.exists(tmp):
396
- os.remove(tmp)
 
 
 
397
 
398
 
399
  async def process_movie_segments(movie_name: str) -> bool:
@@ -404,7 +397,6 @@ async def process_movie_segments(movie_name: str) -> bool:
404
  print(f"Processing movie: {movie_name}")
405
  print(f"{'='*80}")
406
 
407
- # Download original video
408
  video_file = f"{movie_name}.mkv"
409
  print(f"Downloading video: {video_file}")
410
 
@@ -422,7 +414,6 @@ async def process_movie_segments(movie_name: str) -> bool:
422
  print(f"Error: Could not download video: {e}")
423
  return False
424
 
425
- # List segment JSON files
426
  hooks_folder = f"{HOOKS_FOLDER}/{movie_name}"
427
  print(f"Listing segments from: {hooks_folder}")
428
 
@@ -442,7 +433,6 @@ async def process_movie_segments(movie_name: str) -> bool:
442
  return False
443
 
444
  print(f"Found {len(segment_files)} segments")
445
-
446
  temp_dir = tempfile.mkdtemp()
447
 
448
  try:
@@ -519,6 +509,7 @@ async def scan_and_process_videos():
519
  print("Video processing already running, skipping...")
520
  return
521
 
 
522
  print("Waiting 3 minutes before starting video processing...")
523
  await asyncio.sleep(180)
524
 
@@ -562,6 +553,11 @@ async def scan_and_process_videos():
562
 
563
  @app.on_event("startup")
564
  async def startup_event():
 
 
 
 
 
565
  asyncio.create_task(scan_and_process_videos())
566
 
567
 
@@ -570,6 +566,7 @@ async def health():
570
  return JSONResponse({
571
  "status": "running",
572
  "service": "Video Processing Service",
 
573
  "is_processing": processing_state["is_running"],
574
  "total_processed": processing_state["total_processed"],
575
  "error_count": processing_state["error_count"],
@@ -582,6 +579,7 @@ async def health():
582
  @app.get("/status")
583
  async def get_status():
584
  return JSONResponse({
 
585
  "is_running": processing_state["is_running"],
586
  "total_processed": processing_state["total_processed"],
587
  "error_count": processing_state["error_count"],
@@ -598,6 +596,11 @@ async def trigger_processing():
598
  "status": "already_running",
599
  "message": "Video processing is already in progress"
600
  })
 
 
 
 
 
601
  asyncio.create_task(scan_and_process_videos())
602
  return JSONResponse({
603
  "status": "started",
@@ -607,5 +610,5 @@ async def trigger_processing():
607
 
608
  if __name__ == "__main__":
609
  print("Starting Video Processing Service on port 7860...")
610
- print("Processing will begin 3 minutes after startup")
611
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ ENDOFFILE'
2
  #!/usr/bin/env python3
3
  import os
4
  import json
 
42
  "current_file": None,
43
  "error_count": 0,
44
  "last_error": None,
45
+ "processed_files": [],
46
+ "whisper_ready": False
47
  }
48
 
49
+ # Whisper model — loaded async at startup, not at import time
50
+ whisper_model = None
 
 
51
 
52
  HF_DATASET_REPO = "factorstudios/movs"
53
  HOOKS_FOLDER = "hooks"
 
55
  TRANSCRIPTION_FOLDER = "transcriptions"
56
 
57
 
58
+ def _load_whisper_model():
59
+ """Blocking model load — runs in thread executor."""
60
+ global whisper_model
61
+ print("Loading Whisper small model...")
62
+ whisper_model = WhisperModel("small", device="auto", compute_type="int8")
63
+ processing_state["whisper_ready"] = True
64
+ print("✓ Whisper model loaded")
65
+
66
+
67
  def timestamp_to_seconds(timestamp: str) -> float:
68
  """Convert HH:MM:SS to seconds."""
69
  try:
 
81
  "-ss", str(start_seconds),
82
  "-to", str(end_seconds),
83
  "-i", video_path,
84
+ "-vn",
85
+ "-acodec", "pcm_s16le",
86
+ "-ar", "16000",
87
+ "-ac", "1",
88
  output_wav
89
  ]
90
  result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
94
  def transcribe_segment(audio_path: str) -> List[Tuple[float, float, str]]:
95
  """
96
  Transcribe audio with Whisper small.
97
+ Returns list of (start_sec, end_sec, text) relative to segment start.
98
  """
99
  print(" Transcribing audio with Whisper small...")
100
  segments, info = whisper_model.transcribe(
101
  audio_path,
102
  beam_size=5,
103
+ language=None,
104
+ vad_filter=True,
105
  vad_parameters=dict(min_silence_duration_ms=500)
106
  )
107
 
 
121
  lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
122
  l_channel, a_channel, b_channel = cv2.split(lab)
123
 
 
124
  a_channel = cv2.add(a_channel, 5)
125
  b_channel = cv2.add(b_channel, 8)
126
 
 
127
  clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
128
  l_channel = clahe.apply(l_channel)
129
 
130
  lab_enhanced = cv2.merge([l_channel, a_channel, b_channel])
131
  frame = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)
132
 
 
133
  hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV).astype(np.float32)
134
  hsv[:, :, 1] = np.clip(hsv[:, :, 1] * 1.3, 0, 255)
135
  frame = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
136
 
 
137
  frame = cv2.convertScaleAbs(frame, alpha=1.15, beta=10)
138
 
 
139
  kernel = np.array([[-1, -1, -1],
140
  [-1, 9, -1],
141
  [-1, -1, -1]]) / 1.2
142
  sharpened = cv2.filter2D(frame, -1, kernel)
143
  frame = cv2.addWeighted(frame, 0.4, sharpened, 0.6, 0)
144
 
 
145
  rows, cols = frame.shape[:2]
146
  X_kernel = cv2.getGaussianKernel(cols, cols / 2)
147
  Y_kernel = cv2.getGaussianKernel(rows, rows / 2)
 
167
  except Exception:
168
  font = ImageFont.load_default()
169
 
 
170
  max_width = width - 80
171
  wrapped_lines = []
172
  words = text.split()
 
186
 
187
  line_height = font_size + 12
188
  total_text_height = len(wrapped_lines) * line_height
 
 
189
  y_start = int(height * 0.80) - total_text_height // 2
190
  shadow_offset = 3
191
 
 
195
  x = (width - line_width) // 2
196
  y = y_start + i * line_height
197
 
 
198
  draw.text((x + shadow_offset, y + shadow_offset), line, font=font, fill=(0, 0, 0, 200))
 
199
  draw.text((x, y), line, font=font, fill=(255, 255, 255, 255))
200
 
201
  frame_pil = Image.alpha_composite(frame_pil, overlay).convert('RGB')
 
203
 
204
 
205
  def build_frame_caption_map(captions: List[Tuple[float, float, str]], fps: float) -> Dict[int, str]:
206
+ """Convert Whisper segments into a per-frame caption lookup."""
 
 
 
207
  frame_map = {}
208
  for start_sec, end_sec, text in captions:
209
  start_frame = int(start_sec * fps)
 
229
  4. Mux processed video with original audio
230
  """
231
  ffmpeg_video_proc = None
232
+ temp_wav = output_path.replace(".mp4", "_audio.wav")
233
  temp_video_path = output_path.replace(".mp4", "_noaudio.mp4")
234
 
235
  try:
 
251
  print(f"Video info: {fps} fps, {original_width}x{original_height}")
252
  print(f"Extracting segment: {start_time} to {end_time} ({duration:.1f}s)")
253
 
254
+ # ── Step 1: Extract audio → WAV ───────────────────────────────────────
 
255
  print(" Extracting audio segment...")
256
  audio_ok = extract_audio_segment(video_path, start_seconds, end_seconds, temp_wav)
257
+
258
+ # ── Step 2: Transcribe with Whisper ───────────────────────────────────
259
+ if audio_ok and whisper_model is not None:
 
 
260
  captions = transcribe_segment(temp_wav)
261
+ else:
262
+ print(" Warning: Skipping transcription (audio failed or model not ready)")
263
+ captions = []
264
 
 
265
  frame_caption_map = build_frame_caption_map(captions, fps)
266
 
267
  # ── Step 3: Process frames → pipe to FFmpeg ───────────────────────────
 
302
  print(f"Warning: Could not read frame at position {processed_frames}")
303
  break
304
 
 
305
  aspect_ratio = target_width / target_height
306
  if original_width / original_height > aspect_ratio:
307
  new_width = int(original_height * aspect_ratio)
 
315
  frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
316
  frame = apply_color_grading_wedding_retro(frame)
317
 
318
+ current_caption = frame_caption_map.get(processed_frames, current_caption)
319
+ # Clear caption if this frame isn't in the map and the previous caption has ended
320
+ if processed_frames not in frame_caption_map:
321
+ current_caption = ""
 
 
322
 
323
  if current_caption:
324
  frame = burn_captions_to_frame(frame, current_caption)
 
367
  print(f"✗ FFmpeg audio mux failed (code {mux_result.returncode})")
368
  return False
369
 
370
+ print(f"✓ Segment complete: {output_path}")
371
  return True
372
 
373
  except Exception as e:
 
381
  return False
382
 
383
  finally:
 
384
  for tmp in [temp_video_path, temp_wav]:
385
  if tmp and os.path.exists(tmp):
386
+ try:
387
+ os.remove(tmp)
388
+ except Exception:
389
+ pass
390
 
391
 
392
  async def process_movie_segments(movie_name: str) -> bool:
 
397
  print(f"Processing movie: {movie_name}")
398
  print(f"{'='*80}")
399
 
 
400
  video_file = f"{movie_name}.mkv"
401
  print(f"Downloading video: {video_file}")
402
 
 
414
  print(f"Error: Could not download video: {e}")
415
  return False
416
 
 
417
  hooks_folder = f"{HOOKS_FOLDER}/{movie_name}"
418
  print(f"Listing segments from: {hooks_folder}")
419
 
 
433
  return False
434
 
435
  print(f"Found {len(segment_files)} segments")
 
436
  temp_dir = tempfile.mkdtemp()
437
 
438
  try:
 
509
  print("Video processing already running, skipping...")
510
  return
511
 
512
+ # Wait 3 minutes for Space to fully initialize
513
  print("Waiting 3 minutes before starting video processing...")
514
  await asyncio.sleep(180)
515
 
 
553
 
554
  @app.on_event("startup")
555
  async def startup_event():
556
+ """Load Whisper in background, then kick off video processing after 3 min."""
557
+ loop = asyncio.get_event_loop()
558
+ # Load Whisper model in thread so it doesn't block the event loop / health check
559
+ await loop.run_in_executor(None, _load_whisper_model)
560
+ # Kick off processing task (has its own 3-min delay inside)
561
  asyncio.create_task(scan_and_process_videos())
562
 
563
 
 
566
  return JSONResponse({
567
  "status": "running",
568
  "service": "Video Processing Service",
569
+ "whisper_ready": processing_state["whisper_ready"],
570
  "is_processing": processing_state["is_running"],
571
  "total_processed": processing_state["total_processed"],
572
  "error_count": processing_state["error_count"],
 
579
  @app.get("/status")
580
  async def get_status():
581
  return JSONResponse({
582
+ "whisper_ready": processing_state["whisper_ready"],
583
  "is_running": processing_state["is_running"],
584
  "total_processed": processing_state["total_processed"],
585
  "error_count": processing_state["error_count"],
 
596
  "status": "already_running",
597
  "message": "Video processing is already in progress"
598
  })
599
+ if not processing_state["whisper_ready"]:
600
+ return JSONResponse({
601
+ "status": "not_ready",
602
+ "message": "Whisper model is still loading, try again shortly"
603
+ })
604
  asyncio.create_task(scan_and_process_videos())
605
  return JSONResponse({
606
  "status": "started",
 
610
 
611
  if __name__ == "__main__":
612
  print("Starting Video Processing Service on port 7860...")
613
+ print("Whisper will load at startup, processing begins 3 minutes after")
614
+ uvicorn.run(app, host="0.0.0.0", port=7860)