Spaces:
Sleeping
Sleeping
Update processor.py
Browse files- processor.py +85 -47
processor.py
CHANGED
|
@@ -17,6 +17,8 @@ Fixes applied:
|
|
| 17 |
{ clip_index, start, end, segments, full_text }
|
| 18 |
- ✅ NEW: process_video returns a dict with keys:
|
| 19 |
output_files, transcripts, viral_segments, duration
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
import os
|
| 22 |
import gc
|
|
@@ -31,13 +33,9 @@ from core.logger import Logger
|
|
| 31 |
from core.stt import STT, SubtitleSegmenter
|
| 32 |
from core.analyze import analyze_transcript
|
| 33 |
from core.styles import StyleFactory
|
| 34 |
-
from core.subtitle_manager import SubtitleManager
|
| 35 |
|
| 36 |
logger = Logger.get_logger(__name__)
|
| 37 |
|
| 38 |
-
# Max chars per line — must match SubtitleSegmenter constant
|
| 39 |
-
_MAX_CHARS_PER_LINE = 42
|
| 40 |
-
|
| 41 |
|
| 42 |
def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
|
| 43 |
"""
|
|
@@ -52,22 +50,13 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: flo
|
|
| 52 |
|
| 53 |
total_chars = sum(len(w) for w in words)
|
| 54 |
seg_dur = seg_end - seg_start
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
cursor = seg_start
|
| 58 |
|
| 59 |
for i, w in enumerate(words):
|
| 60 |
-
if total_chars > 0
|
| 61 |
-
|
| 62 |
-
else
|
| 63 |
-
fraction = 1.0 / len(words)
|
| 64 |
-
|
| 65 |
-
w_dur = seg_dur * fraction
|
| 66 |
-
w_end = cursor + w_dur
|
| 67 |
-
|
| 68 |
-
# Clamp last word to seg_end to avoid float drift
|
| 69 |
-
if i == len(words) - 1:
|
| 70 |
-
w_end = seg_end
|
| 71 |
|
| 72 |
result.append({
|
| 73 |
"text": w,
|
|
@@ -86,6 +75,51 @@ class VideoProcessor:
|
|
| 86 |
self.stt = STT(model_size=model_size)
|
| 87 |
Config.setup_dirs()
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# โโ JSON helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 90 |
|
| 91 |
def _clean_json_response(self, content):
|
|
@@ -181,7 +215,7 @@ class VideoProcessor:
|
|
| 181 |
|
| 182 |
data = {
|
| 183 |
"segments": full_segments,
|
| 184 |
-
"full_text": full_text,
|
| 185 |
"detected_language": detected_lang,
|
| 186 |
"target_language": target_language,
|
| 187 |
"duration": duration,
|
|
@@ -265,16 +299,16 @@ class VideoProcessor:
|
|
| 265 |
Cuts, styles, captions, and exports each viral clip.
|
| 266 |
|
| 267 |
✅ Returns: (output_files, transcripts_per_clip)
|
| 268 |
-
output_files
|
| 269 |
transcripts_per_clip : list of dicts, one per successfully rendered clip:
|
| 270 |
{
|
| 271 |
-
"clip_index"
|
| 272 |
-
"filename"
|
| 273 |
-
"start"
|
| 274 |
-
"end"
|
| 275 |
-
"language"
|
| 276 |
-
"segments"
|
| 277 |
-
"full_text"
|
| 278 |
}
|
| 279 |
"""
|
| 280 |
logger.info("🎨 Phase 3: Style & Captions …")
|
|
@@ -301,7 +335,7 @@ class VideoProcessor:
|
|
| 301 |
|
| 302 |
# โโ Main loop โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 303 |
output_files = []
|
| 304 |
-
transcripts_per_clip = []
|
| 305 |
|
| 306 |
if not best_clips:
|
| 307 |
logger.warning("⚠️ No clips to process.")
|
|
@@ -337,9 +371,9 @@ class VideoProcessor:
|
|
| 337 |
logger.info(f"\n๐ฌ Clip {i+1}/{len(best_clips)} ({start:.2f}s โ {end:.2f}s)")
|
| 338 |
|
| 339 |
# โโ Output path โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 340 |
-
task_id
|
| 341 |
-
prefix
|
| 342 |
-
out_name
|
| 343 |
final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
|
| 344 |
os.makedirs(os.path.dirname(final_output), exist_ok=True)
|
| 345 |
|
|
@@ -387,6 +421,14 @@ class VideoProcessor:
|
|
| 387 |
playground_path = kwargs.get("playground_path"),
|
| 388 |
)
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
# โโ Export โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 391 |
cpu_count = os.cpu_count() or 4
|
| 392 |
logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
|
|
@@ -402,7 +444,7 @@ class VideoProcessor:
|
|
| 402 |
output_files.append(final_output)
|
| 403 |
logger.info(f"✅ Saved: {final_output}")
|
| 404 |
|
| 405 |
-
#
|
| 406 |
clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
|
| 407 |
transcripts_per_clip.append({
|
| 408 |
"clip_index": i + 1,
|
|
@@ -432,7 +474,7 @@ class VideoProcessor:
|
|
| 432 |
pass
|
| 433 |
gc.collect()
|
| 434 |
|
| 435 |
-
return output_files, transcripts_per_clip
|
| 436 |
|
| 437 |
|
| 438 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -443,36 +485,33 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 443 |
"""
|
| 444 |
End-to-end pipeline: STT → AI analysis → clip export.
|
| 445 |
|
| 446 |
-
✅ Returns a dict
|
| 447 |
{
|
| 448 |
-
"output_files" : list[str],
|
| 449 |
-
"transcripts" : list[dict],
|
| 450 |
-
"viral_segments" : list[dict],
|
| 451 |
-
"full_transcript": str,
|
| 452 |
-
"duration" : float,
|
| 453 |
}
|
| 454 |
|
| 455 |
Important kwargs:
|
| 456 |
source_language : language of the original video โ passed to Whisper.
|
| 457 |
-
If not set โ Whisper auto-detects.
|
| 458 |
language : desired output language (translation + captions).
|
| 459 |
-
If same as source โ no translation.
|
| 460 |
caption_mode : sentence | word | highlight_word
|
| 461 |
caption_style : classic | modern_glow | tiktok_bold | …
|
|
|
|
|
|
|
| 462 |
"""
|
| 463 |
try:
|
| 464 |
-
processor
|
| 465 |
-
|
| 466 |
caption_mode = kwargs.get("caption_mode", "sentence")
|
| 467 |
|
| 468 |
-
# highlight_word and word modes both need word-level timestamps
|
| 469 |
timestamp_mode = (
|
| 470 |
"words"
|
| 471 |
if caption_mode in ("word", "highlight_word")
|
| 472 |
else "segments"
|
| 473 |
)
|
| 474 |
|
| 475 |
-
# Phase 1 + 2: STT + AI analysis
|
| 476 |
viral_segments, duration, stt_data = processor.analyze_impact(
|
| 477 |
video_path,
|
| 478 |
source_language = kwargs.get("source_language"),
|
|
@@ -492,7 +531,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 492 |
|
| 493 |
best_clips = processor.get_best_segments(viral_segments, duration)
|
| 494 |
|
| 495 |
-
# Phase 3: render
|
| 496 |
output_files, transcripts = processor.process_clips(
|
| 497 |
video_path,
|
| 498 |
best_clips,
|
|
@@ -526,7 +564,7 @@ if __name__ == "__main__":
|
|
| 526 |
if len(sys.argv) > 1:
|
| 527 |
result = process_video(sys.argv[1])
|
| 528 |
print(json.dumps({
|
| 529 |
-
"clips":
|
| 530 |
"full_transcript": result["full_transcript"],
|
| 531 |
"clip_transcripts": [
|
| 532 |
{"clip": t["clip_index"], "text": t["full_text"]}
|
|
|
|
| 17 |
{ clip_index, start, end, segments, full_text }
|
| 18 |
- ✅ NEW: process_video returns a dict with keys:
|
| 19 |
output_files, transcripts, viral_segments, duration
|
| 20 |
+
- ✅ NEW: mix_audio method added to VideoProcessor
|
| 21 |
+
blends background music with original video audio
|
| 22 |
"""
|
| 23 |
import os
|
| 24 |
import gc
|
|
|
|
| 33 |
from core.stt import STT, SubtitleSegmenter
|
| 34 |
from core.analyze import analyze_transcript
|
| 35 |
from core.styles import StyleFactory
|
|
|
|
| 36 |
|
| 37 |
logger = Logger.get_logger(__name__)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
|
| 41 |
"""
|
|
|
|
| 50 |
|
| 51 |
total_chars = sum(len(w) for w in words)
|
| 52 |
seg_dur = seg_end - seg_start
|
| 53 |
+
result = []
|
| 54 |
+
cursor = seg_start
|
|
|
|
| 55 |
|
| 56 |
for i, w in enumerate(words):
|
| 57 |
+
fraction = (len(w) / total_chars) if total_chars > 0 else (1.0 / len(words))
|
| 58 |
+
w_dur = seg_dur * fraction
|
| 59 |
+
w_end = seg_end if i == len(words) - 1 else cursor + w_dur
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
result.append({
|
| 62 |
"text": w,
|
|
|
|
| 75 |
self.stt = STT(model_size=model_size)
|
| 76 |
Config.setup_dirs()
|
| 77 |
|
| 78 |
+
# โโ Audio Mixing โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 79 |
+
|
| 80 |
+
def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
|
| 81 |
+
"""
|
| 82 |
+
Blends background music with the original video audio.
|
| 83 |
+
|
| 84 |
+
video_clip : MoviePy VideoFileClip or CompositeVideoClip
|
| 85 |
+
audio_path : path to music file (mp3/m4a/...) โ None = skip
|
| 86 |
+
bg_music_volume : background music level (0.0 โ 1.0)
|
| 87 |
+
original_volume : original video audio level (0.0 โ 1.0)
|
| 88 |
+
|
| 89 |
+
Returns: video_clip with mixed audio
|
| 90 |
+
"""
|
| 91 |
+
if not audio_path or not os.path.exists(audio_path):
|
| 92 |
+
return video_clip
|
| 93 |
+
|
| 94 |
+
clip_duration = video_clip.duration
|
| 95 |
+
logger.info(f"🎵 Mixing audio: {audio_path} | vol={bg_music_volume}")
|
| 96 |
+
|
| 97 |
+
music = mpe.AudioFileClip(audio_path)
|
| 98 |
+
|
| 99 |
+
# لو الموسيقى أقصر من الكليب → نلوب  (if the music is shorter than the clip → loop it)
|
| 100 |
+
if music.duration < clip_duration:
|
| 101 |
+
loops = int(clip_duration / music.duration) + 1
|
| 102 |
+
music = mpe.concatenate_audioclips([music] * loops)
|
| 103 |
+
logger.info(f"🔁 Music looped x{loops}")
|
| 104 |
+
|
| 105 |
+
# قص الموسيقى بنفس طول الكليب  (trim the music to the same length as the clip)
|
| 106 |
+
music = music.subclip(0, clip_duration).volumex(bg_music_volume)
|
| 107 |
+
|
| 108 |
+
original_audio = video_clip.audio
|
| 109 |
+
|
| 110 |
+
# ูู ู
ููุด ุตูุช ุฃุตูู โ ุฎูู ุงูู
ูุณููู ุจุณ
|
| 111 |
+
if original_audio is None:
|
| 112 |
+
logger.info("⚠️ No original audio — using music only")
|
| 113 |
+
return video_clip.set_audio(music)
|
| 114 |
+
|
| 115 |
+
# خلط الصوتين  (mix the two audio tracks)
|
| 116 |
+
mixed = mpe.CompositeAudioClip([
|
| 117 |
+
original_audio.volumex(original_volume),
|
| 118 |
+
music,
|
| 119 |
+
])
|
| 120 |
+
logger.info("✅ Audio mixed successfully")
|
| 121 |
+
return video_clip.set_audio(mixed)
|
| 122 |
+
|
| 123 |
# โโ JSON helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 124 |
|
| 125 |
def _clean_json_response(self, content):
|
|
|
|
| 215 |
|
| 216 |
data = {
|
| 217 |
"segments": full_segments,
|
| 218 |
+
"full_text": full_text,
|
| 219 |
"detected_language": detected_lang,
|
| 220 |
"target_language": target_language,
|
| 221 |
"duration": duration,
|
|
|
|
| 299 |
Cuts, styles, captions, and exports each viral clip.
|
| 300 |
|
| 301 |
✅ Returns: (output_files, transcripts_per_clip)
|
| 302 |
+
output_files : list of str โ paths to rendered .mp4 files
|
| 303 |
transcripts_per_clip : list of dicts, one per successfully rendered clip:
|
| 304 |
{
|
| 305 |
+
"clip_index" : int,
|
| 306 |
+
"filename" : str,
|
| 307 |
+
"start" : float,
|
| 308 |
+
"end" : float,
|
| 309 |
+
"language" : str,
|
| 310 |
+
"segments" : [ ... ],
|
| 311 |
+
"full_text" : str,
|
| 312 |
}
|
| 313 |
"""
|
| 314 |
logger.info("🎨 Phase 3: Style & Captions …")
|
|
|
|
| 335 |
|
| 336 |
# โโ Main loop โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 337 |
output_files = []
|
| 338 |
+
transcripts_per_clip = []
|
| 339 |
|
| 340 |
if not best_clips:
|
| 341 |
logger.warning("⚠️ No clips to process.")
|
|
|
|
| 371 |
logger.info(f"\n๐ฌ Clip {i+1}/{len(best_clips)} ({start:.2f}s โ {end:.2f}s)")
|
| 372 |
|
| 373 |
# โโ Output path โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 374 |
+
task_id = kwargs.get("task_id")
|
| 375 |
+
prefix = f"viral_{task_id}_{i+1}" if task_id else f"viral_{i+1}"
|
| 376 |
+
out_name = f"{prefix}_{style_str}.mp4"
|
| 377 |
final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
|
| 378 |
os.makedirs(os.path.dirname(final_output), exist_ok=True)
|
| 379 |
|
|
|
|
| 421 |
playground_path = kwargs.get("playground_path"),
|
| 422 |
)
|
| 423 |
|
| 424 |
+
# โ
Mix background music โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 425 |
+
final_clip = self.mix_audio(
|
| 426 |
+
final_clip,
|
| 427 |
+
audio_path = kwargs.get("audio_path"),
|
| 428 |
+
bg_music_volume = kwargs.get("bg_music_volume", 0.1),
|
| 429 |
+
original_volume = 1.0,
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
# โโ Export โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 433 |
cpu_count = os.cpu_count() or 4
|
| 434 |
logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
|
|
|
|
| 444 |
output_files.append(final_output)
|
| 445 |
logger.info(f"✅ Saved: {final_output}")
|
| 446 |
|
| 447 |
+
# โโ Build transcript entry โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 448 |
clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
|
| 449 |
transcripts_per_clip.append({
|
| 450 |
"clip_index": i + 1,
|
|
|
|
| 474 |
pass
|
| 475 |
gc.collect()
|
| 476 |
|
| 477 |
+
return output_files, transcripts_per_clip
|
| 478 |
|
| 479 |
|
| 480 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
| 485 |
"""
|
| 486 |
End-to-end pipeline: STT → AI analysis → clip export.
|
| 487 |
|
| 488 |
+
✅ Returns a dict with:
|
| 489 |
{
|
| 490 |
+
"output_files" : list[str],
|
| 491 |
+
"transcripts" : list[dict],
|
| 492 |
+
"viral_segments" : list[dict],
|
| 493 |
+
"full_transcript": str,
|
| 494 |
+
"duration" : float,
|
| 495 |
}
|
| 496 |
|
| 497 |
Important kwargs:
|
| 498 |
source_language : language of the original video โ passed to Whisper.
|
|
|
|
| 499 |
language : desired output language (translation + captions).
|
|
|
|
| 500 |
caption_mode : sentence | word | highlight_word
|
| 501 |
caption_style : classic | modern_glow | tiktok_bold | …
|
| 502 |
+
audio_path : path to background music file
|
| 503 |
+
bg_music_volume : background music volume (0.0 โ 1.0)
|
| 504 |
"""
|
| 505 |
try:
|
| 506 |
+
processor = VideoProcessor(model_size=model_size)
|
|
|
|
| 507 |
caption_mode = kwargs.get("caption_mode", "sentence")
|
| 508 |
|
|
|
|
| 509 |
timestamp_mode = (
|
| 510 |
"words"
|
| 511 |
if caption_mode in ("word", "highlight_word")
|
| 512 |
else "segments"
|
| 513 |
)
|
| 514 |
|
|
|
|
| 515 |
viral_segments, duration, stt_data = processor.analyze_impact(
|
| 516 |
video_path,
|
| 517 |
source_language = kwargs.get("source_language"),
|
|
|
|
| 531 |
|
| 532 |
best_clips = processor.get_best_segments(viral_segments, duration)
|
| 533 |
|
|
|
|
| 534 |
output_files, transcripts = processor.process_clips(
|
| 535 |
video_path,
|
| 536 |
best_clips,
|
|
|
|
| 564 |
if len(sys.argv) > 1:
|
| 565 |
result = process_video(sys.argv[1])
|
| 566 |
print(json.dumps({
|
| 567 |
+
"clips": result["output_files"],
|
| 568 |
"full_transcript": result["full_transcript"],
|
| 569 |
"clip_transcripts": [
|
| 570 |
{"clip": t["clip_index"], "text": t["full_text"]}
|