Spaces:

Vishwas1
/

VideoCreator

Runtime error

App Files Files Community

Vishwas1 committed on Sep 8, 2025

Commit

902db85

verified ·

1 Parent(s): f47c03b

Update app.py

Browse files

Files changed (1) hide show

app.py +266 -98

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
-# app.py — Slideshow with per-image audio + multi-voice TTS (HF Coqui)
-# Works with MoviePy v2.x; falls back to v1 where possible.
 import os
 import tempfile
 import random
 from typing import Optional, List, Dict, Tuple
@@ -13,19 +14,41 @@ import gradio as gr
 # ---- MoviePy imports with v2/v1 compatibility ----
 MPY_V2 = False
 afx = None  # audio effects (v2)
 try:
     # v2.x preferred
-    from moviepy import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips  # type: ignore
     try:
         from moviepy import afx as _afx  # type: ignore
         afx = _afx
     except Exception:
         afx = None
     MPY_V2 = True
 except Exception:
     # v1.x fallback
-    from moviepy.editor import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips  # type: ignore
     MPY_V2 = False
@@ -62,13 +85,45 @@ def apply_linear_gain(audio_clip, gain_linear: float):
     return audio_clip
 # ---------- Image utilities ----------
 def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
     """
     Loads an image file and returns an RGB numpy array with exact (height, width, 3).
-    fit modes:
-      - "contain": letterbox to fit within target size (keeps aspect), background color fills the rest.
       - "cover":   fill target size (keeps aspect) with center crop.
       - "stretch": distort to target size.
     """
@@ -85,11 +140,9 @@ def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain",
     if fit == "cover":
         # scale to cover, then center-crop
         if src_aspect > target_aspect:
-            # image too wide -> fit height, crop width
             new_h = height
             new_w = int(round(src_aspect * new_h))
         else:
-            # image too tall -> fit width, crop height
             new_w = width
             new_h = int(round(new_w / src_aspect))
         img = img.resize((new_w, new_h), Image.LANCZOS)
@@ -113,7 +166,6 @@ def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain",
     return np.array(canvas)
 # ---------- TTS backends ----------
 _TTS_CACHE: Dict[str, object] = {}
@@ -139,15 +191,13 @@ def list_voices(backend_name: str) -> List[str]:
         try:
             tts = _get_tts_backend(backend_name)
             spks = list(getattr(tts, "speakers", []))
-            # Prefer a common male default if present
-            default_pref = ["p225", "p226", "p233", "p243"]
-            ordered = sorted(spks)
-            for pref in default_pref:
-                if pref in ordered:
-                    ordered.remove(pref)
-                    ordered.insert(0, pref)
                     break
-            return ordered
         except Exception:
             return []
     return []
@@ -161,7 +211,6 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
     if backend_name == "Coqui (VCTK multi-speaker)":
         try:
             tts = _get_tts_backend(backend_name)
-            # Coqui writes WAV by default; we'll give a .wav path
             if not out_path.lower().endswith(".wav"):
                 out_path = os.path.splitext(out_path)[0] + ".wav"
             tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
@@ -169,7 +218,6 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
         except Exception:
             return None
-    # gTTS fallback
     if backend_name == "gTTS (simple)":
         try:
             from gtts import gTTS
@@ -183,73 +231,111 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
     return None
-# ---------- Helpers for per-image mapping ----------
-def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
     """
-    Try basename match first; then fall back to index order.
     """
-    result = [None] * len(image_paths)
-    if not audio_paths:
-        return result
-    # Basename map (without extension)
-    audio_map = {}
-    for a in audio_paths:
-        base = os.path.splitext(os.path.basename(a))[0].lower()
-        audio_map[base] = a
-    used = set()
-    # First pass: basename matches
-    for i, ip in enumerate(image_paths):
-        base = os.path.splitext(os.path.basename(ip))[0].lower()
-        if base in audio_map:
-            result[i] = audio_map[base]
-            used.add(audio_map[base])
-    # Second pass: index fallback for any remaining
-    leftover = [a for a in audio_paths if a not in used]
-    for i in range(len(image_paths)):
-        if result[i] is None and leftover:
-            result[i] = leftover.pop(0)
     return result
-# ---------- Core builder ----------
 def build_variable_duration_video(
     frames: List[np.ndarray],
     per_image_durations: List[float],
-    per_image_audios: List[Optional[str]],
-    audio_gain_db: float
 ):
     """
     Create a video where each image has its own duration and optional audio.
     """
     clips = []
-    for frame, dur, apath in zip(frames, per_image_durations, per_image_audios):
         iclip = ImageClip(frame)
-        iclip = clip_with_duration(iclip, float(dur))
-        if apath:
             try:
-                aclip = AudioFileClip(apath)
-                gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
-                if abs(gain - 1.0) > 1e-3:
-                    aclip = apply_linear_gain(aclip, gain)
                 iclip = clip_with_audio(iclip, aclip)
             except Exception:
                 pass
         clips.append(iclip)
-    # Compose ensures audio & size are aligned
     final = concatenate_videoclips(clips, method="compose")
     return final
 def create_slideshow(
     image_files: List,
-    narration_mode: str,                 # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)"
     seconds_per_image: float,
     width: int,
     height: int,
@@ -263,8 +349,10 @@ def create_slideshow(
     match_video_to_narration: bool,
     # per-image inputs
-    per_image_texts: str,                # one line per image; optional "speaker| text" when using Coqui
     per_image_audio_files: List,         # uploaded audio files
     # TTS config
     tts_backend: str,
@@ -295,8 +383,8 @@ def create_slideshow(
     # Load frames
     width = int(width); height = int(height)
     frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
-    # Build outputs based on narration_mode
     out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
     # --- Per-image AUDIO FILES ---
@@ -309,22 +397,44 @@ def create_slideshow(
                 aud_paths.append(ap)
         aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
-        per_img_audio = map_audio_to_images_by_name(paths, aud_paths)
-        # Durations: match each audio if present, else fall back to seconds_per_image
-        durations = []
-        for ap in per_img_audio:
             if ap:
                 try:
                     aclip = AudioFileClip(ap)
-                    durations.append(float(aclip.duration))
                 except Exception:
-                    durations.append(float(seconds_per_image))
             else:
-                durations.append(float(seconds_per_image))
-        final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
         final_clip.write_videofile(
             out_path,
             codec="libx264",
@@ -335,22 +445,21 @@ def create_slideshow(
         )
         return out_path, "Done! Per-image audio applied."
-    # --- Per-image TTS per line ---
     if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
         lines = [ln.strip() for ln in per_image_texts.splitlines()]
         # Pad / trim to image count
-        if len(lines) < len(paths):
-            lines += [""] * (len(paths) - len(lines))
         else:
-            lines = lines[:len(paths)]
-        # Generate audio per line
         tmp_dir = tempfile.gettempdir()
-        per_img_audio = []
-        durations = []
         for idx, text in enumerate(lines):
             voice = tts_voice
-            # Optional "speaker| text" override for Coqui
             if "|" in text and tts_backend.startswith("Coqui"):
                 maybe_speaker, maybe_text = text.split("|", 1)
                 if maybe_text.strip():
@@ -364,18 +473,59 @@ def create_slideshow(
                 gen = synth_tts_to_file(text, tts_backend, voice, apath)
                 apath = gen if gen and os.path.exists(gen) else None
-            per_img_audio.append(apath)
             if apath:
                 try:
                     aclip = AudioFileClip(apath)
-                    durations.append(float(aclip.duration))
                 except Exception:
-                    durations.append(float(seconds_per_image))
             else:
-                durations.append(float(seconds_per_image))
-        final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
         final_clip.write_videofile(
             out_path,
             codec="libx264",
@@ -384,7 +534,7 @@ def create_slideshow(
             preset="medium",
             threads=max(1, (os.cpu_count() or 2) // 2),
         )
-        return out_path, "Done! Per-image TTS applied."
     # --- Single story (one track) ---
     if narration_mode == "Single story" and story_text.strip():
@@ -451,12 +601,13 @@ def update_voice_choices(backend_name: str):
 def ui():
-    with gr.Blocks(title="Slideshow + Per-Image Audio + Voice Picker", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
             # 🖼️ → 🎬 Slideshow Maker
-            - **Per-image audio**: upload audio files (matched by filename or order) **or** generate per-image narration from text lines.
-            - **TTS voices**: pick from **Coqui VCTK**'s multi-speaker voices (male/female), or use gTTS as a lightweight fallback.
             """
         )
@@ -475,7 +626,8 @@ def ui():
                 shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                 seconds_per_image = gr.Slider(
-                    minimum=0.1, maximum=10.0, step=0.1, value=1.5, label="Seconds per Image (used when no per-image audio)"
                 )
                 with gr.Row():
@@ -487,10 +639,15 @@ def ui():
             with gr.Column(scale=1):
                 narration_mode = gr.Radio(
-                    ["None", "Single story", "Per-image (files)", "Per-image (TTS per line)"],
                     value="None",
                     label="Narration mode"
                 )
                 # Single-story UI
                 story_text = gr.Textbox(
                     label="Story (Single track narration)",
@@ -500,15 +657,24 @@ def ui():
                     value=True, label="Match video duration to narration length (single-story)"
                 )
-                # Per-image UI
                 per_image_audio_files = gr.Files(
                     label="Per-image audio files (optional) — matched by filename or order",
                     file_count="multiple",
                     file_types=["audio"]
                 )
                 per_image_texts = gr.Textbox(
-                    label="Per-image TTS text (one line per image). For Coqui, optional 'speaker| text' per line.",
-                    placeholder="Line 1 text\nLine 2 text\n..."
                 )
                 with gr.Row():
@@ -517,7 +683,7 @@ def ui():
                         value="Coqui (VCTK multi-speaker)",
                         label="TTS backend"
                     )
-                    tts_voice = gr.Dropdown(choices=[], label="Voice (for Coqui)")
                 voice_status = gr.Markdown("")
                 audio_gain_db = gr.Slider(
@@ -548,8 +714,10 @@ def ui():
                 sort_mode, shuffle_seed,
                 # single-story
                 story_text, match_video_to_narration,
-                # per-image
-                per_image_texts, per_image_audio_files,
                 # tts
                 tts_backend, tts_voice,
                 audio_gain_db
@@ -560,9 +728,9 @@ def ui():
         gr.Markdown(
             """
             **Tips**
-            - *Per-image audio (files)*: name audio like your images (e.g., `001.jpg` ↔ `001.wav`) for automatic matching.
-            - *Per-image TTS per line*: supply the same number of lines as images; extra/missing lines are trimmed/padded.
-            - *Coqui voice per line*: prefix a line with `speaker| text` to override the dropdown voice (e.g., `p225| Hello there`).
             """
         )

+# app.py — Slideshow with per-image audio + multiline TTS per image + voice picker
+# Works with MoviePy v2.x; falls back to v1 where possible. Python 3.9+ safe.
 import os
+import re
 import tempfile
 import random
 from typing import Optional, List, Dict, Tuple
 # ---- MoviePy imports with v2/v1 compatibility ----
 MPY_V2 = False
 afx = None  # audio effects (v2)
+_CompositeAudioClip = None
+_concat_audios = None
 try:
     # v2.x preferred
+    from moviepy import (
+        ImageSequenceClip,
+        AudioFileClip,
+        ImageClip,
+        concatenate_videoclips,
+    )
     try:
         from moviepy import afx as _afx  # type: ignore
         afx = _afx
     except Exception:
         afx = None
+    try:
+        from moviepy import CompositeAudioClip as _CompositeAudioClip  # type: ignore
+    except Exception:
+        _CompositeAudioClip = None
+    try:
+        from moviepy import concatenate_audioclips as _concat_audios  # type: ignore
+    except Exception:
+        _concat_audios = None
     MPY_V2 = True
 except Exception:
     # v1.x fallback
+    from moviepy.editor import (
+        ImageSequenceClip,
+        AudioFileClip,
+        ImageClip,
+        concatenate_videoclips,
+        CompositeAudioClip as _CompositeAudioClip,  # type: ignore
+        concatenate_audioclips as _concat_audios,   # type: ignore
+    )
     MPY_V2 = False
     return audio_clip
def concat_audios_or_composite(clips: List):
    """
    Join audio clips end-to-end into a single clip.

    Strategy, in order:
      1. No clips -> None; exactly one clip -> that clip, unchanged.
      2. Use the library concatenator (`_concat_audios`) when it was imported.
      3. Otherwise emulate concatenation with `_CompositeAudioClip`, starting
         each clip at the summed duration of the clips before it.
      4. If neither is available, return the first clip only (the remaining
         clips are dropped).

    Args:
        clips: audio clip objects exposing `.duration` and either
            `with_start` (MoviePy v2) or `set_start` (MoviePy v1).

    Returns:
        The combined audio clip, or None when `clips` is empty.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            # Best effort: fall through to the composite-based emulation.
            pass

    if _CompositeAudioClip is not None:
        offset = 0.0
        placed = []
        for clip in clips:
            # MoviePy v2 renamed `set_start` to `with_start`; pick whichever
            # this clip provides so the fallback works on both major versions.
            shift = getattr(clip, "with_start", None) or getattr(clip, "set_start")
            placed.append(shift(offset))
            offset += float(clip.duration)
        combined = _CompositeAudioClip(placed)
        try:
            # Presumably pins the composite's duration to the summed length
            # (helper defined elsewhere in this file); failure is non-fatal.
            combined = clip_with_duration(combined, offset)
        except Exception:
            pass
        return combined

    # Last resort: no way to join clips, keep only the first one.
    return clips[0]
 # ---------- Image utilities ----------
 def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
     """
     Loads an image file and returns an RGB numpy array with exact (height, width, 3).
+    fit:
+      - "contain": letterbox to fit within target size (keeps aspect), background fills rest.
       - "cover":   fill target size (keeps aspect) with center crop.
       - "stretch": distort to target size.
     """
     if fit == "cover":
         # scale to cover, then center-crop
         if src_aspect > target_aspect:
             new_h = height
             new_w = int(round(src_aspect * new_h))
         else:
             new_w = width
             new_h = int(round(new_w / src_aspect))
         img = img.resize((new_w, new_h), Image.LANCZOS)
     return np.array(canvas)
 # ---------- TTS backends ----------
 _TTS_CACHE: Dict[str, object] = {}
         try:
             tts = _get_tts_backend(backend_name)
             spks = list(getattr(tts, "speakers", []))
+            # Bring a common male voice to the top if present
+            for pref in ["p225", "p226", "p233", "p243"]:
+                if pref in spks:
+                    spks.remove(pref)
+                    spks.insert(0, pref)
                     break
+            return sorted(spks) if not spks or spks[0] != "p225" else spks
         except Exception:
             return []
     return []
     if backend_name == "Coqui (VCTK multi-speaker)":
         try:
             tts = _get_tts_backend(backend_name)
             if not out_path.lower().endswith(".wav"):
                 out_path = os.path.splitext(out_path)[0] + ".wav"
             tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
         except Exception:
             return None
     if backend_name == "gTTS (simple)":
         try:
             from gtts import gTTS
     return None
+# ---------- Text parsing for multiline-per-image ----------
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Split text into per-image blocks of TTS lines.

    Blocks are separated by blank (or whitespace-only) lines; each block maps
    to one image, and every non-empty line inside a block is one TTS segment.
    The result always has exactly `expected_images` entries: surplus blocks
    are dropped and missing ones are filled with empty lists.

    Args:
        text: raw textbox content; None or blank yields only empty blocks.
        expected_images: number of images; values below zero are treated as 0.

    Returns:
        A list of length `expected_images`, each item a list of stripped,
        non-empty lines.
    """
    count = max(0, expected_images)
    if not (text or "").strip():
        return [[] for _ in range(count)]

    # Normalise CRLF / bare CR to LF first: the block separator below only
    # recognises "\n", while str.splitlines() also breaks on "\r". Without
    # this, CR-only input would be merged into a single block.
    normalised = text.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for chunk in re.split(r"\n\s*\n", normalised):
        lines = [ln.strip() for ln in chunk.splitlines() if ln.strip()]
        if lines:
            blocks.append(lines)

    # Trim, then pad, so the result lines up one-to-one with the images.
    blocks = blocks[:count]
    blocks.extend([] for _ in range(count - len(blocks)))
    return blocks
+# ---------- Build audio for each image from multiple lines ----------
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build one narration track for a single image from several text lines.

    Each line is synthesized to its own WAV file in `tmp_dir`, the resulting
    segments are joined in order, and the gain is applied once to the joined
    track.

    Args:
        lines: text lines for this image. When `tts_backend` starts with
            "Coqui", a line of the form "speaker| text" uses `speaker` as the
            voice for that line; if the text part is empty the line is
            synthesized verbatim with the default voice.
        tts_backend: backend name passed through to `synth_tts_to_file`.
        default_voice: voice used for lines without a speaker override.
        audio_gain_db: gain in decibels; 0 / falsy leaves the audio untouched.
        tmp_dir: directory that receives the per-line WAV files.

    Returns:
        `(audio_clip, total_duration_seconds)`, or `(None, 0.0)` when no line
        could be synthesized and loaded.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import uuid

    segments = []
    for idx, raw in enumerate(lines):
        voice = default_voice
        text = raw
        if "|" in raw and tts_backend.startswith("Coqui"):
            spk, _, txt = raw.partition("|")
            if txt.strip():
                text = txt.strip()
                if spk.strip():
                    voice = spk.strip()

        # A UUID-based name cannot clash with segments of other images or
        # other runs sharing tmp_dir. The previous random-int name could, and
        # a clash would overwrite a WAV that an earlier clip still reads from.
        out_p = os.path.join(tmp_dir, f"tts_seg_{uuid.uuid4().hex}_{idx}.wav")
        gen = synth_tts_to_file(text, tts_backend, voice, out_p)
        if not (gen and os.path.exists(gen)):
            continue
        try:
            segments.append(AudioFileClip(gen))
        except Exception:
            # Best effort: an unreadable segment is skipped, the rest still play.
            continue

    if not segments:
        return None, 0.0

    combined = concat_audios_or_composite(segments)
    if combined is None:
        return None, 0.0

    # Convert dB to a linear factor; skip the effect when it is effectively 1.
    gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(gain - 1.0) > 1e-3:
        combined = apply_linear_gain(combined, gain)

    return combined, float(combined.duration)
+# ---------- Variable-duration video (per-image) ----------
 def build_variable_duration_video(
     frames: List[np.ndarray],
     per_image_durations: List[float],
+    per_image_audios: List[Optional[object]],  # AudioFileClip or CompositeAudioClip
 ):
     """
     Create a video where each image has its own duration and optional audio.
     """
     clips = []
+    for frame, dur, aclip in zip(frames, per_image_durations, per_image_audios):
         iclip = ImageClip(frame)
+        iclip = clip_with_duration(iclip, float(max(0.05, dur)))
+        if aclip is not None:
             try:
                 iclip = clip_with_audio(iclip, aclip)
             except Exception:
                 pass
         clips.append(iclip)
     final = concatenate_videoclips(clips, method="compose")
     return final
+# ---------- Main create function ----------
 def create_slideshow(
     image_files: List,
+    narration_mode: str,                 # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
     seconds_per_image: float,
     width: int,
     height: int,
     match_video_to_narration: bool,
     # per-image inputs
+    per_image_texts: str,                # one line per image
+    per_image_multiline_blocks: str,     # blocks separated by blank lines
     per_image_audio_files: List,         # uploaded audio files
+    sync_per_image_audio: bool,          # NEW: sync duration to audio for per-image modes
     # TTS config
     tts_backend: str,
     # Load frames
     width = int(width); height = int(height)
     frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
+    num_images = len(frames)
     out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
     # --- Per-image AUDIO FILES ---
                 aud_paths.append(ap)
         aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
+        # Basename match, then index fallback
+        def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
+            result = [None] * len(image_paths)
+            if not audio_paths:
+                return result
+            audio_map = {}
+            for a in audio_paths:
+                base = os.path.splitext(os.path.basename(a))[0].lower()
+                audio_map[base] = a
+            used = set()
+            for i, ip in enumerate(image_paths):
+                base = os.path.splitext(os.path.basename(ip))[0].lower()
+                if base in audio_map:
+                    result[i] = audio_map[base]; used.add(audio_map[base])
+            leftover = [a for a in audio_paths if a not in used]
+            for i in range(len(image_paths)):
+                if result[i] is None and leftover:
+                    result[i] = leftover.pop(0)
+            return result
+        per_img_audio_paths = map_audio_to_images_by_name(paths, aud_paths)
+        per_img_audios = []
+        per_img_durs = []
+        for ap in per_img_audio_paths:
             if ap:
                 try:
                     aclip = AudioFileClip(ap)
+                    per_img_audios.append(aclip)
+                    per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
                 except Exception:
+                    per_img_audios.append(None)
+                    per_img_durs.append(float(seconds_per_image))
             else:
+                per_img_audios.append(None)
+                per_img_durs.append(float(seconds_per_image))
+        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
         final_clip.write_videofile(
             out_path,
             codec="libx264",
         )
         return out_path, "Done! Per-image audio applied."
+    # --- Per-image TTS per single line (legacy one-line-per-image) ---
     if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
         lines = [ln.strip() for ln in per_image_texts.splitlines()]
         # Pad / trim to image count
+        if len(lines) < num_images:
+            lines += [""] * (num_images - len(lines))
         else:
+            lines = lines[:num_images]
         tmp_dir = tempfile.gettempdir()
+        per_img_audios = []
+        per_img_durs = []
         for idx, text in enumerate(lines):
             voice = tts_voice
             if "|" in text and tts_backend.startswith("Coqui"):
                 maybe_speaker, maybe_text = text.split("|", 1)
                 if maybe_text.strip():
                 gen = synth_tts_to_file(text, tts_backend, voice, apath)
                 apath = gen if gen and os.path.exists(gen) else None
             if apath:
                 try:
                     aclip = AudioFileClip(apath)
+                    per_img_audios.append(aclip)
+                    per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
                 except Exception:
+                    per_img_audios.append(None)
+                    per_img_durs.append(float(seconds_per_image))
             else:
+                per_img_audios.append(None)
+                per_img_durs.append(float(seconds_per_image))
+        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
+        final_clip.write_videofile(
+            out_path,
+            codec="libx264",
+            audio_codec="aac",
+            fps=24,
+            preset="medium",
+            threads=max(1, (os.cpu_count() or 2) // 2),
+        )
+        return out_path, "Done! Per-image TTS (single line) applied."
+    # --- NEW: Per-image TTS multiline per image ---
+    if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
+        blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
+        tmp_dir = tempfile.gettempdir()
+        per_img_audios = []
+        per_img_durs = []
+        for idx, lines in enumerate(blocks):
+            if not lines:
+                per_img_audios.append(None)
+                per_img_durs.append(float(seconds_per_image))
+                continue
+            aclip, total = build_audio_for_image_lines(
+                lines=lines,
+                tts_backend=tts_backend,
+                default_voice=tts_voice,
+                audio_gain_db=audio_gain_db,
+                tmp_dir=tmp_dir
+            )
+            if aclip is not None:
+                per_img_audios.append(aclip)
+                per_img_durs.append(float(total) if sync_per_image_audio else float(seconds_per_image))
+            else:
+                per_img_audios.append(None)
+                per_img_durs.append(float(seconds_per_image))
+        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
         final_clip.write_videofile(
             out_path,
             codec="libx264",
             preset="medium",
             threads=max(1, (os.cpu_count() or 2) // 2),
         )
+        return out_path, "Done! Per-image multiline TTS applied."
     # --- Single story (one track) ---
     if narration_mode == "Single story" and story_text.strip():
 def ui():
+    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
             # 🖼️ → 🎬 Slideshow Maker
+            - **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
+            - **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
+            - **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
             """
         )
                 shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                 seconds_per_image = gr.Slider(
+                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
+                    label="Seconds per Image (used when not syncing to audio)"
                 )
                 with gr.Row():
             with gr.Column(scale=1):
                 narration_mode = gr.Radio(
+                    ["None",
+                     "Single story",
+                     "Per-image (files)",
+                     "Per-image (TTS per line)",
+                     "Per-image (TTS multiline per image)"],
                     value="None",
                     label="Narration mode"
                 )
                 # Single-story UI
                 story_text = gr.Textbox(
                     label="Story (Single track narration)",
                     value=True, label="Match video duration to narration length (single-story)"
                 )
+                # Per-image UI (files)
                 per_image_audio_files = gr.Files(
                     label="Per-image audio files (optional) — matched by filename or order",
                     file_count="multiple",
                     file_types=["audio"]
                 )
+                sync_per_image_audio = gr.Checkbox(
+                    value=True, label="Sync image to audio duration (per-image modes)"
+                )
+                # Per-image UI (text)
                 per_image_texts = gr.Textbox(
+                    label="Per-image TTS (one line per image)",
+                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n..."
+                )
+                per_image_multiline_blocks = gr.Textbox(
+                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
+                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n..."
                 )
                 with gr.Row():
                         value="Coqui (VCTK multi-speaker)",
                         label="TTS backend"
                     )
+                    tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
                 voice_status = gr.Markdown("")
                 audio_gain_db = gr.Slider(
                 sort_mode, shuffle_seed,
                 # single-story
                 story_text, match_video_to_narration,
+                # per-image text
+                per_image_texts, per_image_multiline_blocks,
+                # per-image files
+                per_image_audio_files, sync_per_image_audio,
                 # tts
                 tts_backend, tts_voice,
                 audio_gain_db
         gr.Markdown(
             """
             **Tips**
+            - *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
+            - *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
+            - *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
             """
         )