# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.

import logging
import os
import random
import re
import shutil
import tempfile
import uuid
from typing import Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
from PIL import Image, ImageOps

logger = logging.getLogger(__name__)

# ---- MoviePy imports with v2/v1 compatibility ----
MPY_V2 = False
afx = None  # audio effects module (v2 `moviepy.afx`, or v1 `moviepy.editor.afx`)
_CompositeAudioClip = None
_concat_audios = None

try:
    # v2.x preferred
    from moviepy import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
    )
    try:
        from moviepy import afx as _afx  # type: ignore
        afx = _afx
    except Exception:
        afx = None
    try:
        from moviepy import CompositeAudioClip as _CompositeAudioClip  # type: ignore
    except Exception:
        _CompositeAudioClip = None
    try:
        from moviepy import concatenate_audioclips as _concat_audios  # type: ignore
    except Exception:
        _concat_audios = None
    MPY_V2 = True
except Exception:
    # v1.x fallback
    from moviepy.editor import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
        CompositeAudioClip as _CompositeAudioClip,  # type: ignore
        concatenate_audioclips as _concat_audios,  # type: ignore
    )
    try:
        # Without this, `afx` stays None on v1 and gain can never be applied.
        from moviepy.editor import afx as _afx  # type: ignore
        afx = _afx
    except Exception:
        afx = None
    MPY_V2 = False

# Fixed encoding settings shared by every render path.
_DEFAULT_FPS = 24
_FALLBACK_BG = "#000000"


# ---------- Small compatibility helpers ----------
def clip_with_duration(clip, duration: float):
    """Return `clip` with its duration set, using the v2 or v1 MoviePy API."""
    if hasattr(clip, "with_duration"):  # v2
        return clip.with_duration(duration)
    return clip.set_duration(duration)  # v1


def clip_with_audio(clip, audio):
    """Return `clip` with `audio` attached, using the v2 or v1 MoviePy API."""
    if hasattr(clip, "with_audio"):  # v2
        return clip.with_audio(audio)
    return clip.set_audio(audio)  # v1


def _clip_with_start(clip, start: float):
    """Return `clip` shifted to begin at `start` seconds (v2 `with_start`, v1 `set_start`)."""
    if hasattr(clip, "with_start"):  # v2
        return clip.with_start(start)
    return clip.set_start(start)  # v1


def _clip_trimmed(clip, end: float):
    """Return the first `end` seconds of `clip` (v2 `subclipped`, v1 `subclip`)."""
    if hasattr(clip, "subclipped"):  # v2
        return clip.subclipped(0, end)
    return clip.subclip(0, end)  # v1


def _db_to_linear(gain_db) -> float:
    """Convert a gain in dB to a linear amplitude factor; falsy input means unity gain."""
    return 10 ** (float(gain_db) / 20.0) if gain_db else 1.0


def _close_clips(clips) -> None:
    """Best-effort close of MoviePy clips so file readers/ffmpeg handles are released."""
    for clip in clips:
        closer = getattr(clip, "close", None)
        if closer is None:
            continue
        try:
            closer()
        except Exception:
            logger.debug("Ignoring error while closing clip", exc_info=True)


def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Try to apply a linear gain to an audio clip.

    Strategies are attempted in order: the v2 `MultiplyVolume` effect, the v1
    `fx(afx.volumex, ...)` call, and finally a `volumex` method on the clip.
    If none is available (or all fail), the original clip is returned
    unchanged (no-op).
    """
    if afx is not None and hasattr(audio_clip, "with_effects"):
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            logger.warning("MultiplyVolume effect failed; trying v1 API", exc_info=True)
    if afx is not None and hasattr(audio_clip, "fx") and hasattr(afx, "volumex"):
        try:
            return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            logger.warning("afx.volumex failed; trying clip.volumex", exc_info=True)
    if hasattr(audio_clip, "volumex"):
        try:
            return audio_clip.volumex(gain_linear)
        except Exception:
            logger.warning("clip.volumex failed; gain not applied", exc_info=True)
    return audio_clip


def concat_audios_or_composite(clips: List):
    """
    Concatenate audio clips.

    Prefer the built-in concatenator; otherwise composite sequentially using
    start offsets to emulate concatenation. Returns None for an empty list,
    the single clip for a one-element list, and the first clip as a last
    resort when nothing else works.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            logger.warning("concatenate_audioclips failed; compositing instead", exc_info=True)

    # Fallback: sequential CompositeAudioClip
    if _CompositeAudioClip is not None:
        offset = 0.0
        placed = []
        for clip in clips:
            try:
                length = float(clip.duration)
                placed.append(_clip_with_start(clip, offset))
                offset += length
            except Exception:
                logger.warning("Skipping audio segment that could not be placed", exc_info=True)
        if placed:
            composite = _CompositeAudioClip(placed)
            try:
                composite = clip_with_duration(composite, offset)
            except Exception:
                logger.debug("Could not set composite duration", exc_info=True)
            return composite

    # Last resort
    return clips[0]


# ---------- Image utilities ----------
def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Load an image file and return an RGB numpy array of shape (height, width, 3).

    The EXIF orientation tag, when present, is applied before fitting so that
    photos are not rendered sideways.

    fit:
      - "contain": letterbox to fit within target size (keeps aspect), background fills rest.
      - "cover":   fill target size (keeps aspect) with center crop.
      - "stretch": distort to target size.
    Any other value behaves like "contain".

    Raises:
      ValueError: if `width` or `height` is not positive.
      OSError: if the file cannot be opened or decoded by Pillow.
    """
    width, height = int(width), int(height)
    if width <= 0 or height <= 0:
        raise ValueError(f"width and height must be positive, got {width}x{height}")

    # `convert` forces the pixel data to load, so the file can be closed right after.
    with Image.open(path) as src:
        try:
            upright = ImageOps.exif_transpose(src)
        except Exception:
            upright = None  # malformed EXIF: fall back to the stored orientation
        img = (upright if upright is not None else src).convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    iw, ih = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(iw) / float(ih)
    source_is_wider = src_aspect > target_aspect

    if fit == "cover":
        # Scale so the image covers the target, then center-crop the overflow.
        if source_is_wider:
            new_h = height
            new_w = max(width, int(round(src_aspect * new_h)))
        else:
            new_w = width
            new_h = max(height, int(round(new_w / src_aspect)))
        scaled = img.resize((new_w, new_h), Image.LANCZOS)
        left = (new_w - width) // 2
        top = (new_h - height) // 2
        return np.array(scaled.crop((left, top, left + width, top + height)))

    # contain (letterbox/pillarbox)
    try:
        canvas = Image.new("RGB", (width, height), bg or _FALLBACK_BG)
    except ValueError:
        # Unparseable colour string: use black rather than failing the render.
        logger.warning("Unrecognised background colour %r; using black", bg)
        canvas = Image.new("RGB", (width, height), _FALLBACK_BG)

    # max(1, ...) guards against a zero-sized resize for extreme aspect ratios.
    if source_is_wider:
        new_w = width
        new_h = max(1, int(round(new_w / src_aspect)))
    else:
        new_h = height
        new_w = max(1, int(round(src_aspect * new_h)))
    resized = img.resize((new_w, new_h), Image.LANCZOS)
    canvas.paste(resized, ((width - new_w) // 2, (height - new_h) // 2))
    return np.array(canvas)


# ---------- TTS backends ----------
_TTS_CACHE: Dict[str, object] = {}


def _get_tts_backend(backend_name: str):
    """
    Lazy-load a TTS backend instance.

    - "Coqui (VCTK multi-speaker)" -> coqui-ai/TTS model: tts_models/en/vctk/vits
      (instantiated once and cached in `_TTS_CACHE`)
    - "gTTS (simple)" -> sentinel string "gTTS"
    Any other name returns None.
    """
    if backend_name == "Coqui (VCTK multi-speaker)":
        if backend_name not in _TTS_CACHE:
            from TTS.api import TTS  # heavy import
            _TTS_CACHE[backend_name] = TTS("tts_models/en/vctk/vits")
        return _TTS_CACHE[backend_name]
    if backend_name == "gTTS (simple)":
        return "gTTS"
    return None


def list_voices(backend_name: str) -> List[str]:
    """
    Return the selectable speaker IDs for `backend_name`.

    Only the Coqui backend has voices; every other backend yields []. The
    result is de-duplicated, with a hard-coded list of preferred IDs moved to
    the front. If the backend cannot be loaded at all, a short static list is
    returned instead of raising.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []
    try:
        tts = _get_tts_backend(backend_name)

        # Try common attributes across TTS versions
        candidates: List[str] = []
        for dotted in ("speakers", "speaker_manager.speaker_names", "speaker_manager.speaker_ids"):
            obj = tts
            try:
                for part in dotted.split("."):
                    obj = getattr(obj, part)
                names = list(obj) if obj is not None else []
            except Exception:
                continue
            if names:
                candidates = [str(x) for x in names]
                break

        # Sensible fallback if nothing found (known VCTK IDs)
        if not candidates:
            candidates = [
                "p225", "p226", "p233", "p243", "p254", "p256", "p258", "p259",
                "p270", "p273", "p274", "p278", "p279", "p302", "p311", "p316",
                "p334", "p345", "p360", "p363", "p374",
            ]

        # IDs the original author flagged as male voices (not verified here);
        # they are nudged toward the top of the list when present.
        male_pref = ["p225", "p226", "p233", "p243", "p270", "p274", "p279", "p311", "p345", "p360", "p363"]
        unique = list(dict.fromkeys(candidates))  # de-duplicate, keep order
        available = set(unique)
        promoted = [v for v in male_pref if v in available]
        promoted_set = set(promoted)
        return promoted + [v for v in unique if v not in promoted_set]
    except Exception:
        # Absolute fallback
        logger.warning("Could not enumerate TTS voices; using static list", exc_info=True)
        return ["p225", "p226", "p233", "p243"]


def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Synthesize `text` to an audio file and return the path actually written.

    The extension of `out_path` is forced to ".wav" for Coqui and ".mp3" for
    gTTS, so the returned path may differ from `out_path`. Returns None when
    the text is blank, the backend is unknown, or synthesis fails.
    """
    text = (text or "").strip()
    if not text:
        return None

    if backend_name == "Coqui (VCTK multi-speaker)":
        try:
            tts = _get_tts_backend(backend_name)
            if not out_path.lower().endswith(".wav"):
                out_path = os.path.splitext(out_path)[0] + ".wav"
            tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
            return out_path
        except Exception:
            logger.warning("Coqui TTS synthesis failed", exc_info=True)
            return None

    if backend_name == "gTTS (simple)":
        try:
            from gtts import gTTS
            if not out_path.lower().endswith(".mp3"):
                out_path = os.path.splitext(out_path)[0] + ".mp3"
            gTTS(text=text, lang="en").save(out_path)
            return out_path
        except Exception:
            logger.warning("gTTS synthesis failed", exc_info=True)
            return None

    return None


def _split_speaker_override(raw: str, default_voice: Optional[str], tts_backend: Optional[str]) -> Tuple[Optional[str], str]:
    """
    Parse an optional 'speaker| text' prefix (honoured for Coqui backends only).

    Returns (voice, text). An empty speaker part keeps `default_voice`; an
    empty text part keeps the raw line as the text.
    """
    voice, text = default_voice, raw
    if "|" in raw and (tts_backend or "").startswith("Coqui"):
        speaker, _, spoken = raw.partition("|")
        if spoken.strip():
            text = spoken.strip()
        if speaker.strip():
            voice = speaker.strip()
    return voice, text


# ---------- Text parsing for multiline-per-image ----------
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Split text into blocks by blank lines. Each block = one image.
    Within a block, each non-empty line is a separate TTS segment.

    The result always has exactly `expected_images` entries: missing blocks
    are padded with empty lists and surplus blocks are dropped.
    """
    if not (text or "").strip():
        return [[] for _ in range(expected_images)]

    blocks = [b.strip() for b in re.split(r"\n\s*\n", text.strip()) if b.strip()]
    # Pad/trim to match number of images
    if len(blocks) < expected_images:
        blocks += [""] * (expected_images - len(blocks))
    else:
        blocks = blocks[:expected_images]

    return [[ln.strip() for ln in block.splitlines() if ln.strip()] for block in blocks]


# ---------- Build audio for each image from multiple lines ----------
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str,
    opened_clips: Optional[List] = None,
):
    """
    For a single image:
      - Generate TTS for each line (respect 'speaker| text' override).
      - Concatenate segments.
      - Apply gain to the final track.
      - Return (audio_clip, total_duration) or (None, 0.0) if no audio.

    `opened_clips`, when given, receives every AudioFileClip opened here so
    the caller can close them once rendering is finished.
    """
    segments = []
    for idx, raw in enumerate(lines):
        voice, text = _split_speaker_override(raw, default_voice, tts_backend)
        # uuid keeps segment names unique even when callers share `tmp_dir`.
        out_p = os.path.join(tmp_dir, f"tts_seg_{uuid.uuid4().hex}_{idx}.wav")
        gen = synth_tts_to_file(text, tts_backend, voice, out_p)
        if not (gen and os.path.exists(gen)):
            continue
        try:
            seg = AudioFileClip(gen)
        except Exception:
            logger.warning("Could not open synthesized audio %s", gen, exc_info=True)
            continue
        segments.append(seg)
        if opened_clips is not None:
            opened_clips.append(seg)

    if not segments:
        return None, 0.0

    combined = concat_audios_or_composite(segments)
    if combined is None:
        return None, 0.0

    # Apply gain on the final composite if needed
    gain = _db_to_linear(audio_gain_db)
    if abs(gain - 1.0) > 1e-3:
        combined = apply_linear_gain(combined, gain)

    return combined, float(combined.duration)


# ---------- Variable-duration video (per-image) ----------
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip or CompositeAudioClip
):
    """
    Create a video where each image has its own duration and optional audio.

    Each image is shown for at least 0.05 s. Audio longer than its image's
    slot is trimmed to the slot so it cannot overlap the following slides.
    """
    clips = []
    for frame, dur, aclip in zip(frames, per_image_durations, per_image_audios):
        shown_for = float(max(0.05, dur))
        iclip = clip_with_duration(ImageClip(frame), shown_for)
        if aclip is not None:
            try:
                audio_len = getattr(aclip, "duration", None)
                if audio_len is not None and float(audio_len) > shown_for + 1e-6:
                    aclip = _clip_trimmed(aclip, shown_for)
                iclip = clip_with_audio(iclip, aclip)
            except Exception:
                logger.warning("Could not attach audio to a slide; leaving it silent", exc_info=True)
        clips.append(iclip)
    return concatenate_videoclips(clips, method="compose")


# ---------- Private helpers for create_slideshow ----------
def _existing_paths(files) -> List[str]:
    """Normalize Gradio uploads (objects with .name/.path, or plain paths) to existing paths."""
    found = []
    for item in files or []:
        candidate = getattr(item, "name", None) or getattr(item, "path", None) or item
        if isinstance(candidate, (str, os.PathLike)) and os.path.exists(candidate):
            found.append(candidate)
    return found


def _basename_key(path) -> str:
    """Case-insensitive file name used for sorting."""
    return os.path.basename(path).lower()


def _stem_key(path) -> str:
    """Case-insensitive file name without extension, used to pair audio with images."""
    return os.path.splitext(os.path.basename(path))[0].lower()


def _map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
    """Pair audio with images by matching stem first, then hand out leftovers in order."""
    by_stem = {_stem_key(a): a for a in audio_paths}
    matched = [by_stem.get(_stem_key(ip)) for ip in image_paths]
    used = {a for a in matched if a is not None}
    spare = iter([a for a in audio_paths if a not in used])
    return [m if m is not None else next(spare, None) for m in matched]


def _open_audio(path: Optional[str], opened: List):
    """Open `path` as an AudioFileClip (tracked in `opened`); None on failure or no path."""
    if not path:
        return None
    try:
        clip = AudioFileClip(path)
    except Exception:
        logger.warning("Could not open audio file %s", path, exc_info=True)
        return None
    opened.append(clip)
    return clip


def _build_uniform_clip(frames: List[np.ndarray], seconds_per_image: float, fps: int):
    """Build an ImageSequenceClip that shows every frame for the same time."""
    repeats = max(1, int(round(float(seconds_per_image) * fps)))
    expanded = [frame for frame in frames for _ in range(repeats)]
    return ImageSequenceClip(expanded, fps=fps)


def _write_video(clip, out_path: str, fps: int) -> None:
    """Encode `clip` to `out_path` as H.264/AAC using half of the available cores."""
    clip.write_videofile(
        out_path,
        codec="libx264",
        audio_codec="aac",
        fps=fps,
        preset="medium",
        threads=max(1, (os.cpu_count() or 2) // 2),
    )


# ---------- Main create function ----------
def create_slideshow(
    image_files: List,
    narration_mode: str,  # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
    seconds_per_image: float,
    width: int,
    height: int,
    fit_mode: str,
    bg_color: str,
    sort_mode: str,
    shuffle_seed: Optional[float],
    # single-story inputs
    story_text: str,
    match_video_to_narration: bool,
    # per-image inputs
    per_image_texts: str,  # one line per image
    per_image_multiline_blocks: str,  # blocks separated by blank lines
    per_image_audio_files: List,  # uploaded audio files
    sync_per_image_audio: bool,  # sync duration to audio for per-image modes
    # TTS config
    tts_backend: str,
    tts_voice: Optional[str],
    audio_gain_db: float,
):
    """
    Render the slideshow and return (video_path, status_message).

    On invalid input, (None, message) is returned instead of raising. A
    narration mode whose inputs are empty falls through to a silent,
    uniform-duration slideshow. Each call writes to its own temporary output
    directory, so concurrent requests cannot overwrite each other's files.
    Errors raised by the encoder itself propagate to the caller.
    """
    if not image_files:
        return None, "Please upload at least one image."

    paths = _existing_paths(image_files)
    if not paths:
        return None, "Could not read the uploaded images."

    try:
        width = int(width)
        height = int(height)
        seconds_per_image = float(seconds_per_image)
    except (TypeError, ValueError):
        return None, "Width, height and seconds per image must be numbers."
    if width <= 0 or height <= 0:
        return None, "Width and height must be greater than zero."

    # Cleared Gradio text boxes may arrive as None.
    story_text = story_text or ""
    per_image_texts = per_image_texts or ""
    per_image_multiline_blocks = per_image_multiline_blocks or ""

    # Order
    if sort_mode == "Filename (A→Z)":
        paths = sorted(paths, key=_basename_key)
    elif sort_mode == "Filename (Z→A)":
        paths = sorted(paths, key=_basename_key, reverse=True)
    elif sort_mode == "Shuffle":
        random.Random(int(shuffle_seed or 0)).shuffle(paths)

    # Load frames
    try:
        frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
    except (OSError, ValueError) as err:
        return None, f"Could not load one of the images: {err}"
    num_images = len(frames)

    fps = _DEFAULT_FPS
    idle = float(seconds_per_image)
    gain = _db_to_linear(audio_gain_db)

    # Unique locations per call: the output must outlive this function (Gradio
    # serves it), while the TTS scratch files are removed once encoding is done.
    out_path = os.path.join(tempfile.mkdtemp(prefix="slideshow_"), "slideshow_output.mp4")
    scratch_dir = tempfile.mkdtemp(prefix="slideshow_tts_")
    opened: List = []  # every clip that holds a reader and must be closed

    def slot_duration(aclip, total=None) -> float:
        """Seconds an image stays on screen given its (possibly missing) audio."""
        if aclip is None or not sync_per_image_audio:
            return idle
        return float(aclip.duration if total is None else total)

    try:
        per_img_audios: Optional[List] = None
        per_img_durs: List[float] = []
        message = ""

        # --- Per-image AUDIO FILES ---
        if narration_mode == "Per-image (files)" and per_image_audio_files:
            aud_paths = sorted(_existing_paths(per_image_audio_files), key=_basename_key)
            per_img_audios = [
                _open_audio(ap, opened) for ap in _map_audio_to_images_by_name(paths, aud_paths)
            ]
            per_img_durs = [slot_duration(a) for a in per_img_audios]
            message = "Done! Per-image audio applied."

        # --- Per-image TTS per single line ---
        elif narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
            lines = [ln.strip() for ln in per_image_texts.splitlines()]
            # Pad / trim to image count
            lines = (lines + [""] * num_images)[:num_images]
            per_img_audios = []
            for idx, raw in enumerate(lines):
                voice, text = _split_speaker_override(raw, tts_voice, tts_backend)
                aclip = None
                if text:
                    target = os.path.join(scratch_dir, f"tts_line_{idx}.wav")
                    gen = synth_tts_to_file(text, tts_backend, voice, target)
                    aclip = _open_audio(gen if gen and os.path.exists(gen) else None, opened)
                per_img_durs.append(slot_duration(aclip))
                # Same gain handling as the other TTS modes.
                if aclip is not None and abs(gain - 1.0) > 1e-3:
                    aclip = apply_linear_gain(aclip, gain)
                per_img_audios.append(aclip)
            message = "Done! Per-image TTS (single line) applied."

        # --- Per-image TTS multiline per image ---
        elif narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
            per_img_audios = []
            for lines in parse_multiline_blocks(per_image_multiline_blocks, num_images):
                aclip, total = (None, 0.0)
                if lines:
                    aclip, total = build_audio_for_image_lines(
                        lines=lines,
                        tts_backend=tts_backend,
                        default_voice=tts_voice,
                        audio_gain_db=audio_gain_db,
                        tmp_dir=scratch_dir,
                        opened_clips=opened,
                    )
                per_img_audios.append(aclip)
                per_img_durs.append(slot_duration(aclip, total))
            message = "Done! Per-image multiline TTS applied."

        if per_img_audios is not None:
            final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
            opened.insert(0, final_clip)  # close the composite before its sources
            _write_video(final_clip, out_path, fps)
            return out_path, message

        # --- Single story (one track) / no narration: uniform duration ---
        clip = _build_uniform_clip(frames, idle, fps)
        opened.append(clip)
        message = "Done! Video created without narration."

        if narration_mode == "Single story" and story_text.strip():
            message = "Done! Story narration applied."
            target = os.path.join(scratch_dir, "narration_single.wav")
            gen = synth_tts_to_file(story_text.strip(), tts_backend, tts_voice, target)
            if gen and os.path.exists(gen):
                try:
                    aclip = AudioFileClip(gen)
                    opened.append(aclip)
                    if match_video_to_narration:
                        clip = clip_with_duration(clip, float(aclip.duration))
                    if abs(gain - 1.0) > 1e-3:
                        aclip = apply_linear_gain(aclip, gain)
                    clip = clip_with_audio(clip, aclip)
                except Exception:
                    logger.warning("Could not attach story narration; rendering silent video", exc_info=True)

        _write_video(clip, out_path, fps)
        return out_path, message
    finally:
        # Release readers first so the scratch files can be deleted on every OS.
        _close_clips(opened)
        shutil.rmtree(scratch_dir, ignore_errors=True)


# ---------- UI ----------
def update_voice_choices(backend_name: str):
    """Return (dropdown update, status text) for the voices of `backend_name`."""
    voices = list_voices(backend_name)
    value = voices[0] if voices else None
    msg = f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
    return gr.update(choices=voices, value=value), msg


def ui():
    """Build and return the Gradio Blocks app (not launched)."""
    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🖼️ → 🎬 Slideshow Maker
            - **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
            - **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
            - **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
                    label="Seconds per Image (used when not syncing to audio)"
                )
                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")
                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")

            with gr.Column(scale=1):
                narration_mode = gr.Radio(
                    ["None", "Single story", "Per-image (files)", "Per-image (TTS per line)", "Per-image (TTS multiline per image)"],
                    value="None",
                    label="Narration mode"
                )

                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=20,
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )

                # Per-image UI (files)
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                sync_per_image_audio = gr.Checkbox(
                    value=True, label="Sync image to audio duration (per-image modes)"
                )

                # Per-image UI (text)
                per_image_texts = gr.Textbox(
                    label="Per-image TTS (one line per image)",
                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
                    lines=8,
                )
                per_image_multiline_blocks = gr.Textbox(
                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
                    lines=40,
                )

                with gr.Row():
                    tts_backend = gr.Dropdown(
                        ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                        value="Coqui (VCTK multi-speaker)",
                        label="TTS backend"
                    )
                    tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
                voice_status = gr.Markdown("")

                audio_gain_db = gr.Slider(
                    minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
                )

        run_btn = gr.Button("Create Video", variant="primary")
        status = gr.Markdown("")
        video_out = gr.Video(label="Result", autoplay=False)

        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Also populate on initial load
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )

        # Main action
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files, narration_mode, seconds_per_image, width, height, fit_mode, bg_color, sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image text inputs
                per_image_texts, per_image_multiline_blocks,
                # per-image files + sync
                per_image_audio_files, sync_per_image_audio,
                # tts
                tts_backend, tts_voice, audio_gain_db
            ],
            outputs=[video_out, status],
        )

        gr.Markdown(
            """
            **Tips**
            - *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
            - *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
            - *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
            """
        )

    return demo


if __name__ == "__main__":
    ui().launch()