Update processor.py
processor.py  CHANGED  (+88 -30)
@@ -1,6 +1,5 @@
 """
 VideoProcessor – Core pipeline for viral clip extraction.
-
 Fixes applied:
 - source_language (for Whisper) separated from target_language (for translation/captions)
 - Removed duplicate _clean_json_response (json_repair version kept)
@@ -13,6 +12,11 @@ Fixes applied:
   using SubtitleSegmenter._split_into_lines so line splits match translated content
 - ✅ FIX: translated word timestamps distributed proportional to word length
   (instead of uniform distribution) for better highlight sync
+- ✅ NEW: process_clips now returns (output_files, transcripts_per_clip)
+  where transcripts_per_clip is a list of dicts:
+      { clip_index, start, end, segments, full_text }
+- ✅ NEW: process_video returns a dict with keys:
+      output_files, transcripts, viral_segments, full_transcript, duration
 """
 import os
 import gc
@@ -28,7 +32,6 @@ from core.stt import STT, SubtitleSegmenter
 from core.analyze import analyze_transcript
 from core.styles import StyleFactory
 from core.subtitle_manager import SubtitleManager
-# from core.free_translator import FreeTranslator
 
 logger = Logger.get_logger(__name__)
 
@@ -41,7 +44,6 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float
     ✅ FIX: Distribute word timestamps proportional to character length instead of
     uniform distribution. Longer words get more time, giving better sync in
     highlight_word mode after translation.
-
     words: list of str (translated words)
     Returns: list of { text, start, end }
     """
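
The body of _distribute_timestamps_by_length is not changed by this commit and is not shown above. A minimal sketch of the proportional scheme its docstring describes, assuming a simple running-cursor implementation (the last-word clamp is an assumption, not necessarily the file's exact code):

def _distribute_timestamps_by_length(words, seg_start, seg_end):
    """Sketch: allot each word a slice of [seg_start, seg_end]
    proportional to its character length."""
    total_chars = sum(len(w) for w in words) or 1   # avoid division by zero
    span = seg_end - seg_start
    out, cursor = [], seg_start
    for w in words:
        dur = span * len(w) / total_chars           # longer words get more time
        out.append({"text": w, "start": cursor, "end": cursor + dur})
        cursor += dur
    if out:
        out[-1]["end"] = seg_end                    # absorb float rounding drift
    return out
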
@@ -156,7 +158,6 @@ class VideoProcessor:
                  progress_callback=None):
         """
         STT + AI viral-moment detection.
-
         source_language : passed directly to Whisper.
                           None → Whisper auto-detects (slower but safe).
         target_language : stored in data for process_clips to use for
@@ -180,6 +181,7 @@
 
         data = {
             "segments": full_segments,
+            "full_text": full_text,  # ✅ NEW: store full transcript text
             "detected_language": detected_lang,
             "target_language": target_language,
             "duration": duration,
@@ -262,14 +264,18 @@
         """
         Cuts, styles, captions, and exports each viral clip.
 
-        ✅
-
-
-
-
-
-
-
+        ✅ Returns: (output_files, transcripts_per_clip)
+            output_files         : list of str – paths to rendered .mp4 files
+            transcripts_per_clip : list of dicts, one per successfully rendered clip:
+                {
+                    "clip_index" : int,      # 1-based
+                    "filename"   : str,      # output filename (basename)
+                    "start"      : float,    # clip start in original video (s)
+                    "end"        : float,    # clip end in original video (s)
+                    "language"   : str,      # detected/caption language
+                    "segments"   : [ ... ],  # STT segments relative to clip start
+                    "full_text"  : str,      # concatenated text of all segments
+                }
         """
         logger.info("🎨 Phase 3: Style & Captions …")
         if progress_callback:
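
With the new two-value return, a caller can pair each rendered clip with its transcript. A hypothetical consumer, assuming the argument order used by process_video below and an invented sidecar-file convention:

from pathlib import Path

output_files, transcripts = processor.process_clips(video_path, best_clips, stt_data)
for entry in transcripts:
    # Hypothetical: write each clip's text next to its .mp4 as a .txt sidecar.
    Path(entry["filename"]).with_suffix(".txt").write_text(
        entry["full_text"], encoding="utf-8"
    )
    print(f'clip {entry["clip_index"]}: '
          f'{entry["start"]:.1f}-{entry["end"]:.1f}s, lang={entry["language"]}')
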
@@ -284,11 +290,8 @@
             logger.error(f"❌ Could not determine video duration: {e}")
 
         # ── Language resolution ───────────────────────────────────────────────
-        detected_lang
-
-        # SIMPLIFIED: No separate translation step.
-        # STT has already provided the correct text (English or Arabic).
-        caption_lang = detected_lang
+        detected_lang = data.get("detected_language", "en")
+        caption_lang = detected_lang
         logger.info(f"🗣️ Captions language: {caption_lang}")
 
         # ── Normalise style string once ───────────────────────────────────────
@@ -297,11 +300,12 @@
             style_str = style_str.split(".")[-1]
 
         # ── Main loop ─────────────────────────────────────────────────────────
-        output_files
+        output_files = []
+        transcripts_per_clip = []  # ✅ NEW
 
         if not best_clips:
             logger.warning("⚠️ No clips to process.")
-            return []
+            return [], []
 
         logger.info(f"🎬 Processing {len(best_clips)} clip(s) …")
 
@@ -344,18 +348,16 @@
                 clip = current_video_clip.subclip(start, end)
 
                 # ── Build segment_transcript ──────────────────────────────────
-
+                clip_segments = []
 
                 for s in data["segments"]:
                     if s["start"] >= end or s["end"] <= start:
                         continue
 
-                    new_seg
+                    new_seg = s.copy()
                     new_seg["start"] = max(0, s["start"] - start)
                     new_seg["end"] = min(end - start, s["end"] - start)
 
-                    # SIMPLIFIED: No translation step here.
-                    # Just adjust timestamps relative to clip start.
                     if "words" in s:
                         new_seg["words"] = [
                             {
@@ -366,8 +368,10 @@
                             for w in s["words"]
                             if w["start"] < end and w["end"] > start
                         ]
-
-
+
+                    clip_segments.append(new_seg)
+
+                segment_transcript = {"segments": clip_segments}
 
                 # ── Apply style + captions ────────────────────────────────────
                 style_strategy = StyleFactory.get_style(style_str)
@@ -398,6 +402,23 @@
                     output_files.append(final_output)
                     logger.info(f"✅ Saved: {final_output}")
 
+                    # ✅ NEW: Build transcript entry for this clip
+                    clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
+                    transcripts_per_clip.append({
+                        "clip_index": i + 1,
+                        "filename": out_name,
+                        "start": start,
+                        "end": end,
+                        "language": caption_lang,
+                        "segments": clip_segments,
+                        "full_text": clip_full_text,
+                    })
+                    logger.info(
+                        f"📝 Transcript for clip {i+1}: "
+                        f"{len(clip_segments)} segment(s), "
+                        f"{len(clip_full_text)} chars"
+                    )
+
             except Exception as e:
                 logger.error(f"❌ Clip {i+1} error: {e}")
                 logger.error(traceback.format_exc())
@@ -411,7 +432,7 @@
                     pass
             gc.collect()
 
-        return output_files
+        return output_files, transcripts_per_clip  # ✅ tuple now
 
 
 # ─────────────────────────────────────────────────────────────────────────────
@@ -422,6 +443,15 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
     """
     End-to-end pipeline: STT → AI analysis → clip export.
 
+    ✅ Returns a dict (instead of a plain list) with:
+        {
+            "output_files"   : list[str],   # paths to rendered clips
+            "transcripts"    : list[dict],  # per-clip transcripts (see process_clips)
+            "viral_segments" : list[dict],  # raw AI viral segment detections
+            "full_transcript": str,         # full video transcript text
+            "duration"       : float,       # video duration in seconds
+        }
+
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
                           If not set → Whisper auto-detects.
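
Because process_video now returns the same dict shape on success, on "no viral segments", and on failure (see the hunks below), callers no longer need None checks. A small sketch under that assumption, with an invented input path:

result = process_video("input.mp4", style="cinematic_blur", model_size="base")

if not result["output_files"]:
    # Covers both "no viral segments" and a caught exception:
    # the keys are always present.
    print("No clips produced; full transcript has",
          len(result["full_transcript"]), "characters")
else:
    for path, t in zip(result["output_files"], result["transcripts"]):
        print(f'{path}: clip {t["clip_index"]} covers '
              f'{t["start"]:.1f}-{t["end"]:.1f}s')
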
@@ -452,12 +482,18 @@
 
         if not viral_segments:
             logger.warning("⚠️ No viral segments found.")
-            return
+            return {
+                "output_files": [],
+                "transcripts": [],
+                "viral_segments": [],
+                "full_transcript": stt_data.get("full_text", ""),
+                "duration": duration,
+            }
 
         best_clips = processor.get_best_segments(viral_segments, duration)
 
         # Phase 3: render
-
+        output_files, transcripts = processor.process_clips(
             video_path,
             best_clips,
             stt_data,
@@ -465,13 +501,35 @@
             **kwargs,
         )
 
+        return {
+            "output_files": output_files,
+            "transcripts": transcripts,
+            "viral_segments": viral_segments,
+            "full_transcript": stt_data.get("full_text", ""),
+            "duration": duration,
+        }
+
     except Exception as e:
         logger.error(f"❌ Processing failed: {e}")
         logger.error(traceback.format_exc())
-        return
+        return {
+            "output_files": [],
+            "transcripts": [],
+            "viral_segments": [],
+            "full_transcript": "",
+            "duration": 0,
+        }
 
 
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
-        process_video(sys.argv[1])
+        result = process_video(sys.argv[1])
+        print(json.dumps({
+            "clips": result["output_files"],
+            "full_transcript": result["full_transcript"],
+            "clip_transcripts": [
+                {"clip": t["clip_index"], "text": t["full_text"]}
+                for t in result["transcripts"]
+            ],
+        }, indent=2, ensure_ascii=False))