Update processor.py
processor.py  CHANGED  (+88 -30)
@@ -1,6 +1,5 @@
 """
 VideoProcessor – Core pipeline for viral clip extraction.
-
 Fixes applied:
 - source_language (for Whisper) separated from target_language (for translation/captions)
 - Removed duplicate _clean_json_response (json_repair version kept)
@@ -13,6 +12,11 @@ Fixes applied:
   using SubtitleSegmenter._split_into_lines so line splits match translated content
 - ✅ FIX: translated word timestamps distributed proportional to word length
   (instead of uniform distribution) for better highlight sync
+- ✅ NEW: process_clips now returns (output_files, transcripts_per_clip)
+  where transcripts_per_clip is a list of dicts:
+      { clip_index, start, end, segments, full_text }
+- ✅ NEW: process_video returns a dict with keys:
+      output_files, transcripts, viral_segments, full_transcript, duration
 """
 import os
 import gc
@@ -28,7 +32,6 @@ from core.stt import STT, SubtitleSegmenter
 from core.analyze import analyze_transcript
 from core.styles import StyleFactory
 from core.subtitle_manager import SubtitleManager
-# from core.free_translator import FreeTranslator
 
 logger = Logger.get_logger(__name__)
 
@@ -41,7 +44,6 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float
     ✅ FIX: Distribute word timestamps proportional to character length instead of
     uniform distribution. Longer words get more time, giving better sync in
     highlight_word mode after translation.
-
     words: list of str (translated words)
     Returns: list of { text, start, end }
     """
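
The body of _distribute_timestamps_by_length is not changed by this commit and is not shown above. A minimal sketch of the proportional scheme its docstring describes, assuming a simple running-cursor implementation (the last-word clamp is an assumption, not necessarily the file's exact code):

def _distribute_timestamps_by_length(words, seg_start, seg_end):
    """Sketch: allot each word a slice of [seg_start, seg_end]
    proportional to its character length."""
    total_chars = sum(len(w) for w in words) or 1   # avoid division by zero
    span = seg_end - seg_start
    out, cursor = [], seg_start
    for w in words:
        dur = span * len(w) / total_chars           # longer words get more time
        out.append({"text": w, "start": cursor, "end": cursor + dur})
        cursor += dur
    if out:
        out[-1]["end"] = seg_end                    # absorb float rounding drift
    return out
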
@@ -156,7 +158,6 @@ class VideoProcessor:
                  progress_callback=None):
         """
         STT + AI viral-moment detection.
-
         source_language : passed directly to Whisper.
                           None → Whisper auto-detects (slower but safe).
         target_language : stored in data for process_clips to use for
@@ -180,6 +181,7 @@
 
         data = {
             "segments": full_segments,
+            "full_text": full_text,  # ✅ NEW: store full transcript text
             "detected_language": detected_lang,
             "target_language": target_language,
             "duration": duration,
@@ -262,14 +264,18 @@
         """
         Cuts, styles, captions, and exports each viral clip.
 
-        ✅
-
-
-
-
-
-
-
+        ✅ Returns: (output_files, transcripts_per_clip)
+            output_files         : list of str – paths to rendered .mp4 files
+            transcripts_per_clip : list of dicts, one per successfully rendered clip:
+                {
+                    "clip_index" : int,      # 1-based
+                    "filename"   : str,      # output filename (basename)
+                    "start"      : float,    # clip start in original video (s)
+                    "end"        : float,    # clip end in original video (s)
+                    "language"   : str,      # detected/caption language
+                    "segments"   : [ ... ],  # STT segments relative to clip start
+                    "full_text"  : str,      # concatenated text of all segments
+                }
         """
         logger.info("🎨 Phase 3: Style & Captions …")
         if progress_callback:
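
With the new two-value return, a caller can pair each rendered clip with its transcript. A hypothetical consumer, assuming the argument order used by process_video below and an invented sidecar-file convention:

from pathlib import Path

output_files, transcripts = processor.process_clips(video_path, best_clips, stt_data)
for entry in transcripts:
    # Hypothetical: write each clip's text next to its .mp4 as a .txt sidecar.
    Path(entry["filename"]).with_suffix(".txt").write_text(
        entry["full_text"], encoding="utf-8"
    )
    print(f'clip {entry["clip_index"]}: '
          f'{entry["start"]:.1f}-{entry["end"]:.1f}s, lang={entry["language"]}')
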
@@ -284,11 +290,8 @@
             logger.error(f"❌ Could not determine video duration: {e}")
 
         # ── Language resolution ───────────────────────────────────────────────
-        detected_lang
-
-        # SIMPLIFIED: No separate translation step.
-        # STT has already provided the correct text (English or Arabic).
-        caption_lang = detected_lang
+        detected_lang = data.get("detected_language", "en")
+        caption_lang = detected_lang
         logger.info(f"🗣️ Captions language: {caption_lang}")
 
         # ── Normalise style string once ───────────────────────────────────────
@@ -297,11 +300,12 @@
             style_str = style_str.split(".")[-1]
 
         # ── Main loop ─────────────────────────────────────────────────────────
-        output_files
+        output_files = []
+        transcripts_per_clip = []  # ✅ NEW
 
         if not best_clips:
             logger.warning("⚠️ No clips to process.")
-            return []
+            return [], []
 
         logger.info(f"🎬 Processing {len(best_clips)} clip(s) …")
 
@@ -344,18 +348,16 @@
                 clip = current_video_clip.subclip(start, end)
 
                 # ── Build segment_transcript ──────────────────────────────────
-
+                clip_segments = []
 
                 for s in data["segments"]:
                     if s["start"] >= end or s["end"] <= start:
                         continue
 
-                    new_seg
+                    new_seg = s.copy()
                     new_seg["start"] = max(0, s["start"] - start)
                     new_seg["end"] = min(end - start, s["end"] - start)
 
-                    # SIMPLIFIED: No translation step here.
-                    # Just adjust timestamps relative to clip start.
                     if "words" in s:
                         new_seg["words"] = [
                             {
@@ -366,8 +368,10 @@
                             for w in s["words"]
                             if w["start"] < end and w["end"] > start
                         ]
-
-
+
+                    clip_segments.append(new_seg)
+
+                segment_transcript = {"segments": clip_segments}
 
                 # ── Apply style + captions ────────────────────────────────────
                 style_strategy = StyleFactory.get_style(style_str)
@@ -398,6 +402,23 @@
                     output_files.append(final_output)
                     logger.info(f"✅ Saved: {final_output}")
 
+                    # ✅ NEW: Build transcript entry for this clip
+                    clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
+                    transcripts_per_clip.append({
+                        "clip_index": i + 1,
+                        "filename": out_name,
+                        "start": start,
+                        "end": end,
+                        "language": caption_lang,
+                        "segments": clip_segments,
+                        "full_text": clip_full_text,
+                    })
+                    logger.info(
+                        f"📝 Transcript for clip {i+1}: "
+                        f"{len(clip_segments)} segment(s), "
+                        f"{len(clip_full_text)} chars"
+                    )
+
             except Exception as e:
                 logger.error(f"❌ Clip {i+1} error: {e}")
                 logger.error(traceback.format_exc())
@@ -411,7 +432,7 @@
                     pass
             gc.collect()
 
-        return output_files
+        return output_files, transcripts_per_clip  # ✅ tuple now
 
 
 # ─────────────────────────────────────────────────────────────────────────────
@@ -422,6 +443,15 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
     """
     End-to-end pipeline: STT → AI analysis → clip export.
 
+    ✅ Returns a dict (instead of a plain list) with:
+        {
+            "output_files"   : list[str],   # paths to rendered clips
+            "transcripts"    : list[dict],  # per-clip transcripts (see process_clips)
+            "viral_segments" : list[dict],  # raw AI viral segment detections
+            "full_transcript": str,         # full video transcript text
+            "duration"       : float,       # video duration in seconds
+        }
+
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
                           If not set → Whisper auto-detects.
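
Because process_video now returns the same dict shape on success, on "no viral segments", and on failure (see the hunks below), callers no longer need None checks. A small sketch under that assumption, with an invented input path:

result = process_video("input.mp4", style="cinematic_blur", model_size="base")

if not result["output_files"]:
    # Covers both "no viral segments" and a caught exception:
    # the keys are always present.
    print("No clips produced; full transcript has",
          len(result["full_transcript"]), "characters")
else:
    for path, t in zip(result["output_files"], result["transcripts"]):
        print(f'{path}: clip {t["clip_index"]} covers '
              f'{t["start"]:.1f}-{t["end"]:.1f}s')
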
@@ -452,12 +482,18 @@
 
         if not viral_segments:
             logger.warning("⚠️ No viral segments found.")
-            return
+            return {
+                "output_files": [],
+                "transcripts": [],
+                "viral_segments": [],
+                "full_transcript": stt_data.get("full_text", ""),
+                "duration": duration,
+            }
 
         best_clips = processor.get_best_segments(viral_segments, duration)
 
         # Phase 3: render
-
+        output_files, transcripts = processor.process_clips(
             video_path,
             best_clips,
             stt_data,
@@ -465,13 +501,35 @@
             **kwargs,
         )
 
+        return {
+            "output_files": output_files,
+            "transcripts": transcripts,
+            "viral_segments": viral_segments,
+            "full_transcript": stt_data.get("full_text", ""),
+            "duration": duration,
+        }
+
     except Exception as e:
         logger.error(f"❌ Processing failed: {e}")
         logger.error(traceback.format_exc())
-        return
+        return {
+            "output_files": [],
+            "transcripts": [],
+            "viral_segments": [],
+            "full_transcript": "",
+            "duration": 0,
+        }
 
 
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
-        process_video(sys.argv[1])
+        result = process_video(sys.argv[1])
+        print(json.dumps({
+            "clips": result["output_files"],
+            "full_transcript": result["full_transcript"],
+            "clip_transcripts": [
+                {"clip": t["clip_index"], "text": t["full_text"]}
+                for t in result["transcripts"]
+            ],
+        }, indent=2, ensure_ascii=False))