Spaces:

Vishwas1
/

VideoCreator

Runtime error

App Files Community

Vishwas1 committed on Sep 8, 2025

Commit

1c01e22

verified ·

1 Parent(s): 902db85

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -34

app.py CHANGED Viewed

@@ -1,11 +1,11 @@
-# app.py — Slideshow with per-image audio + multiline TTS per image + voice picker
-# Works with MoviePy v2.x; falls back to v1 where possible. Python 3.9+ safe.
 import os
 import re
 import tempfile
 import random
-from typing import Optional, List, Dict, Tuple
 import numpy as np
 from PIL import Image
@@ -46,8 +46,8 @@ except Exception:
         AudioFileClip,
         ImageClip,
         concatenate_videoclips,
-        CompositeAudioClip as _CompositeAudioClip,  # type: ignore
-        concatenate_audioclips as _concat_audios,   # type: ignore
     )
     MPY_V2 = False
@@ -87,7 +87,7 @@ def apply_linear_gain(audio_clip, gain_linear: float):
 def concat_audios_or_composite(clips: List):
     """
-    Concatenate audio clips. Prefer built-in concatenator; otherwise composite
     sequentially using start offsets to emulate concatenation.
     """
     if not clips:
@@ -101,19 +101,21 @@ def concat_audios_or_composite(clips: List):
             pass
     # Fallback: sequential CompositeAudioClip
     if _CompositeAudioClip is not None:
-        starts = []
         total = 0.0
         seq = []
         for c in clips:
-            seq.append(c.set_start(total))
-            total += float(c.duration)
         comp = _CompositeAudioClip(seq)
         try:
             comp = clip_with_duration(comp, total)
         except Exception:
             pass
         return comp
-    # last resort
     return clips[0]
@@ -187,20 +189,57 @@ def _get_tts_backend(backend_name: str):
 def list_voices(backend_name: str) -> List[str]:
-    if backend_name == "Coqui (VCTK multi-speaker)":
-        try:
-            tts = _get_tts_backend(backend_name)
-            spks = list(getattr(tts, "speakers", []))
-            # Bring a common male voice to the top if present
-            for pref in ["p225", "p226", "p233", "p243"]:
-                if pref in spks:
-                    spks.remove(pref)
-                    spks.insert(0, pref)
                     break
-            return sorted(spks) if not spks or spks[0] != "p225" else spks
-        except Exception:
-            return []
-    return []
 def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
@@ -279,7 +318,6 @@ def build_audio_for_image_lines(
                 text = txt.strip()
                 if spk.strip():
                     voice = spk.strip()
-        # Synthesize this line
         out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
         gen = synth_tts_to_file(text, tts_backend, voice, out_p)
         if gen and os.path.exists(gen):
@@ -352,7 +390,7 @@ def create_slideshow(
     per_image_texts: str,                # one line per image
     per_image_multiline_blocks: str,     # blocks separated by blank lines
     per_image_audio_files: List,         # uploaded audio files
-    sync_per_image_audio: bool,          # NEW: sync duration to audio for per-image modes
     # TTS config
     tts_backend: str,
@@ -445,7 +483,7 @@ def create_slideshow(
         )
         return out_path, "Done! Per-image audio applied."
-    # --- Per-image TTS per single line (legacy one-line-per-image) ---
     if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
         lines = [ln.strip() for ln in per_image_texts.splitlines()]
         # Pad / trim to image count
@@ -496,7 +534,7 @@ def create_slideshow(
         )
         return out_path, "Done! Per-image TTS (single line) applied."
-    # --- NEW: Per-image TTS multiline per image ---
     if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
         blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
         tmp_dir = tempfile.gettempdir()
@@ -538,7 +576,7 @@ def create_slideshow(
     # --- Single story (one track) ---
     if narration_mode == "Single story" and story_text.strip():
-        # Build base video (uniform duration)
         fps = 24
         repeats = max(1, int(round(float(seconds_per_image) * fps)))
         expanded = []
@@ -597,7 +635,8 @@ def create_slideshow(
 def update_voice_choices(backend_name: str):
     voices = list_voices(backend_name)
     value = voices[0] if voices else None
-    return gr.update(choices=voices, value=value), f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
 def ui():
@@ -651,7 +690,9 @@ def ui():
                 # Single-story UI
                 story_text = gr.Textbox(
                     label="Story (Single track narration)",
-                    placeholder="Type or paste your story..."
                 )
                 match_video_to_narration = gr.Checkbox(
                     value=True, label="Match video duration to narration length (single-story)"
@@ -670,11 +711,15 @@ def ui():
                 # Per-image UI (text)
                 per_image_texts = gr.Textbox(
                     label="Per-image TTS (one line per image)",
-                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n..."
                 )
                 per_image_multiline_blocks = gr.Textbox(
                     label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
-                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n..."
                 )
                 with gr.Row():
@@ -702,6 +747,13 @@ def ui():
             outputs=[tts_voice, voice_status]
         )
         # Main action
         run_btn.click(
             fn=create_slideshow,
@@ -714,9 +766,9 @@ def ui():
                 sort_mode, shuffle_seed,
                 # single-story
                 story_text, match_video_to_narration,
-                # per-image text
                 per_image_texts, per_image_multiline_blocks,
-                # per-image files
                 per_image_audio_files, sync_per_image_audio,
                 # tts
                 tts_backend, tts_voice,

+# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
+# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
 import os
 import re
 import tempfile
 import random
+from typing import Optional, List, Dict
 import numpy as np
 from PIL import Image
         AudioFileClip,
         ImageClip,
         concatenate_videoclips,
+        CompositeAudioClip as _CompositeAudioClip,   # type: ignore
+        concatenate_audioclips as _concat_audios,    # type: ignore
     )
     MPY_V2 = False
 def concat_audios_or_composite(clips: List):
     """
+    Concatenate audio clips. Prefer the built-in concatenator; otherwise composite
     sequentially using start offsets to emulate concatenation.
     """
     if not clips:
             pass
     # Fallback: sequential CompositeAudioClip
     if _CompositeAudioClip is not None:
         total = 0.0
         seq = []
         for c in clips:
+            try:
+                seq.append(c.set_start(total))
+                total += float(c.duration)
+            except Exception:
+                pass
         comp = _CompositeAudioClip(seq)
         try:
             comp = clip_with_duration(comp, total)
         except Exception:
             pass
         return comp
+    # Last resort
     return clips[0]
 def list_voices(backend_name: str) -> List[str]:
+    if backend_name != "Coqui (VCTK multi-speaker)":
+        return []
+    try:
+        tts = _get_tts_backend(backend_name)
+        candidates: List[str] = []
+        # Try common attributes across TTS versions
+        for path in [
+            "speakers",
+            "speaker_manager.speaker_names",
+            "speaker_manager.speaker_ids",
+        ]:
+            obj = tts
+            try:
+                for part in path.split("."):
+                    obj = getattr(obj, part)
+                names = list(obj) if obj is not None else []
+                if names:
+                    candidates = [str(x) for x in names]
                     break
+            except Exception:
+                continue
+        # Sensible fallback if nothing found (known VCTK IDs)
+        if not candidates:
+            candidates = [
+                "p225","p226","p233","p243","p254","p256","p258","p259",
+                "p270","p273","p274","p278","p279","p302","p311","p316",
+                "p334","p345","p360","p363","p374"
+            ]
+        # Nudge common male IDs toward the top if present
+        male_pref = ["p225","p226","p233","p243","p270","p274","p279","p311","p345","p360","p363"]
+        ordered = candidates[:]
+        for pref in reversed(male_pref):
+            if pref in ordered:
+                ordered.remove(pref)
+                ordered.insert(0, pref)
+        # Deduplicate while preserving order
+        seen, final = set(), []
+        for v in ordered:
+            if v not in seen:
+                seen.add(v)
+                final.append(v)
+        return final
+    except Exception:
+        # Absolute fallback
+        return ["p225","p226","p233","p243"]
 def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
                 text = txt.strip()
                 if spk.strip():
                     voice = spk.strip()
         out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
         gen = synth_tts_to_file(text, tts_backend, voice, out_p)
         if gen and os.path.exists(gen):
     per_image_texts: str,                # one line per image
     per_image_multiline_blocks: str,     # blocks separated by blank lines
     per_image_audio_files: List,         # uploaded audio files
+    sync_per_image_audio: bool,          # sync duration to audio for per-image modes
     # TTS config
     tts_backend: str,
         )
         return out_path, "Done! Per-image audio applied."
+    # --- Per-image TTS per single line ---
     if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
         lines = [ln.strip() for ln in per_image_texts.splitlines()]
         # Pad / trim to image count
         )
         return out_path, "Done! Per-image TTS (single line) applied."
+    # --- Per-image TTS multiline per image ---
     if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
         blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
         tmp_dir = tempfile.gettempdir()
     # --- Single story (one track) ---
     if narration_mode == "Single story" and story_text.strip():
+        # Base video (uniform duration)
         fps = 24
         repeats = max(1, int(round(float(seconds_per_image) * fps)))
         expanded = []
 def update_voice_choices(backend_name: str):
     voices = list_voices(backend_name)
     value = voices[0] if voices else None
+    msg = f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
+    return gr.update(choices=voices, value=value), msg
 def ui():
                 # Single-story UI
                 story_text = gr.Textbox(
                     label="Story (Single track narration)",
+                    placeholder="Type or paste your story...",
+                    lines=6,
+                    autogrow=True
                 )
                 match_video_to_narration = gr.Checkbox(
                     value=True, label="Match video duration to narration length (single-story)"
                 # Per-image UI (text)
                 per_image_texts = gr.Textbox(
                     label="Per-image TTS (one line per image)",
+                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
+                    lines=8,
+                    autogrow=True
                 )
                 per_image_multiline_blocks = gr.Textbox(
                     label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
+                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
+                    lines=12,
+                    autogrow=True
                 )
                 with gr.Row():
             outputs=[tts_voice, voice_status]
         )
+        # Also populate on initial load
+        demo.load(
+            fn=update_voice_choices,
+            inputs=[tts_backend],
+            outputs=[tts_voice, voice_status]
+        )
         # Main action
         run_btn.click(
             fn=create_slideshow,
                 sort_mode, shuffle_seed,
                 # single-story
                 story_text, match_video_to_narration,
+                # per-image text inputs
                 per_image_texts, per_image_multiline_blocks,
+                # per-image files + sync
                 per_image_audio_files, sync_per_image_audio,
                 # tts
                 tts_backend, tts_voice,