# (Removed Hugging Face Spaces page-scrape artifact: "Spaces:" / "Runtime error" banner text.)
# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
import os
import re
import tempfile
import random
from typing import Optional, List, Dict
import numpy as np
from PIL import Image
import gradio as gr
# ---- MoviePy imports with v2/v1 compatibility ----
# MPY_V2 records which major version imported successfully; the helpers below
# rely on hasattr() feature checks rather than this flag, so it is informational.
MPY_V2 = False
afx = None  # audio effects module (v2); None when unavailable
_CompositeAudioClip = None  # CompositeAudioClip class, if importable
_concat_audios = None  # concatenate_audioclips function, if importable
try:
    # v2.x preferred: top-level names replaced moviepy.editor
    from moviepy import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
    )
    try:
        from moviepy import afx as _afx  # type: ignore
        afx = _afx
    except Exception:
        afx = None
    try:
        from moviepy import CompositeAudioClip as _CompositeAudioClip  # type: ignore
    except Exception:
        _CompositeAudioClip = None
    try:
        from moviepy import concatenate_audioclips as _concat_audios  # type: ignore
    except Exception:
        _concat_audios = None
    MPY_V2 = True
except Exception:
    # v1.x fallback: everything lives under moviepy.editor
    from moviepy.editor import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
        CompositeAudioClip as _CompositeAudioClip,  # type: ignore
        concatenate_audioclips as _concat_audios,  # type: ignore
    )
    MPY_V2 = False
| # ---------- Small compatibility helpers ---------- | |
def clip_with_duration(clip, duration: float):
    """Set *duration* on a clip via the v2 API when present, else the v1 API."""
    setter = getattr(clip, "with_duration", None)  # MoviePy v2
    if setter is None:
        setter = clip.set_duration  # MoviePy v1
    return setter(duration)
def clip_with_audio(clip, audio):
    """Attach *audio* to a clip via the v2 API when present, else the v1 API."""
    attach = getattr(clip, "with_audio", None)  # MoviePy v2
    if attach is None:
        attach = clip.set_audio  # MoviePy v1
    return attach(audio)
def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Multiply an audio clip's volume by *gain_linear*.

    Tries the MoviePy v2 effect API first, then the v1 ``fx``/``volumex``
    route; if neither is usable, the original clip is returned unchanged.
    """
    # v2 route: clip.with_effects([afx.MultiplyVolume(g)])
    if hasattr(audio_clip, "with_effects") and afx is not None:
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            pass
    # v1 route: clip.fx(afx.volumex, g)
    if hasattr(audio_clip, "fx"):
        try:
            if afx is not None and hasattr(afx, "volumex"):
                return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            pass
    # No usable effect API: no-op.
    return audio_clip
def concat_audios_or_composite(clips: List):
    """
    Concatenate audio clips into a single track.

    Prefers the library's ``concatenate_audioclips``; otherwise emulates
    concatenation with a CompositeAudioClip whose members are offset by the
    running total of the preceding durations.

    Returns None for an empty list, the clip itself for a single clip.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]
    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            pass
    # Fallback: sequential CompositeAudioClip
    if _CompositeAudioClip is not None:
        total = 0.0
        seq = []
        for c in clips:
            try:
                # BUGFIX: MoviePy v2 renamed set_start -> with_start; the old
                # code called set_start unconditionally, so on v2 every clip
                # raised and was silently dropped. Support both APIs.
                if hasattr(c, "with_start"):
                    seq.append(c.with_start(total))
                else:
                    seq.append(c.set_start(total))
                total += float(c.duration)
            except Exception:
                pass
        # Guard: if every clip failed, CompositeAudioClip([]) would raise
        # uncaught; fall through to the last-resort return instead.
        if seq:
            comp = _CompositeAudioClip(seq)
            try:
                comp = clip_with_duration(comp, total)
            except Exception:
                pass
            return comp
    # Last resort: return the first clip unmodified.
    return clips[0]
| # ---------- Image utilities ---------- | |
def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Load an image file and return an RGB numpy array of exactly (height, width, 3).

    fit:
      - "contain": letterbox/pillarbox inside the target (keeps aspect); *bg* fills the rest.
      - "cover":   fill the target (keeps aspect) with a center crop.
      - "stretch": distort to the target size.
    """
    img = Image.open(path).convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    src_w, src_h = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(src_w) / float(src_h)
    wider_than_target = src_aspect > target_aspect

    if fit == "cover":
        # Scale so the image covers the target, then crop the overflow centered.
        if wider_than_target:
            scaled_h = height
            scaled_w = int(round(src_aspect * scaled_h))
        else:
            scaled_w = width
            scaled_h = int(round(scaled_w / src_aspect))
        img = img.resize((scaled_w, scaled_h), Image.LANCZOS)
        x0 = (scaled_w - width) // 2
        y0 = (scaled_h - height) // 2
        return np.array(img.crop((x0, y0, x0 + width, y0 + height)))

    # "contain": scale to fit inside, paste centered onto a background canvas.
    if wider_than_target:
        scaled_w = width
        scaled_h = int(round(scaled_w / src_aspect))
    else:
        scaled_h = height
        scaled_w = int(round(src_aspect * scaled_h))
    canvas = Image.new("RGB", (width, height), bg)
    resized = img.resize((scaled_w, scaled_h), Image.LANCZOS)
    canvas.paste(resized, ((width - scaled_w) // 2, (height - scaled_h) // 2))
    return np.array(canvas)
| # ---------- TTS backends ---------- | |
| _TTS_CACHE: Dict[str, object] = {} | |
| def _get_tts_backend(backend_name: str): | |
| """ | |
| Lazy-load a TTS backend instance. | |
| - "Coqui (VCTK multi-speaker)" -> coqui-ai/TTS model: tts_models/en/vctk/vits | |
| - "gTTS (simple)" -> sentinel string "gTTS" | |
| """ | |
| if backend_name == "Coqui (VCTK multi-speaker)": | |
| if backend_name not in _TTS_CACHE: | |
| from TTS.api import TTS # heavy import | |
| _TTS_CACHE[backend_name] = TTS("tts_models/en/vctk/vits") | |
| return _TTS_CACHE[backend_name] | |
| elif backend_name == "gTTS (simple)": | |
| return "gTTS" | |
| return None | |
def list_voices(backend_name: str) -> List[str]:
    """
    Return the speaker IDs available for *backend_name*.

    Only the Coqui VCTK backend is multi-speaker; every other backend yields [].
    Falls back to a hard-coded list of known VCTK IDs when the installed TTS
    version exposes none, and floats common male IDs to the front of the list.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []
    try:
        engine = _get_tts_backend(backend_name)
        found: List[str] = []
        # Attribute paths that hold speaker names across TTS library versions.
        for attr_path in ("speakers",
                          "speaker_manager.speaker_names",
                          "speaker_manager.speaker_ids"):
            try:
                node = engine
                for attr in attr_path.split("."):
                    node = getattr(node, attr)
                names = [] if node is None else list(node)
                if names:
                    found = [str(n) for n in names]
                    break
            except Exception:
                continue
        if not found:
            # Sensible fallback: known VCTK speaker IDs.
            found = [
                "p225", "p226", "p233", "p243", "p254", "p256", "p258", "p259",
                "p270", "p273", "p274", "p278", "p279", "p302", "p311", "p316",
                "p334", "p345", "p360", "p363", "p374",
            ]
        # Nudge common male IDs toward the top, keeping their preference order.
        male_pref = ["p225", "p226", "p233", "p243", "p270", "p274",
                     "p279", "p311", "p345", "p360", "p363"]
        ordered = list(found)
        for pref in reversed(male_pref):
            if pref in ordered:
                ordered.remove(pref)
                ordered.insert(0, pref)
        # De-duplicate while preserving order.
        return list(dict.fromkeys(ordered))
    except Exception:
        # Absolute fallback when even backend construction fails.
        return ["p225", "p226", "p233", "p243"]
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Render *text* to speech at *out_path* and return the actual file path used.

    The extension is normalized per backend (.wav for Coqui, .mp3 for gTTS).
    Returns None for empty text, unknown backends, or any synthesis failure.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return None

    if backend_name == "Coqui (VCTK multi-speaker)":
        try:
            engine = _get_tts_backend(backend_name)
            target = out_path
            if not target.lower().endswith(".wav"):
                target = os.path.splitext(target)[0] + ".wav"
            engine.tts_to_file(text=cleaned, speaker=voice, file_path=target)
            return target
        except Exception:
            return None

    if backend_name == "gTTS (simple)":
        try:
            from gtts import gTTS
            target = out_path
            if not target.lower().endswith(".mp3"):
                target = os.path.splitext(target)[0] + ".mp3"
            gTTS(text=cleaned, lang="en").save(target)
            return target
        except Exception:
            return None

    return None
| # ---------- Text parsing for multiline-per-image ---------- | |
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Split *text* into per-image blocks separated by blank lines.

    Each block maps to one image; every non-empty line inside a block is one
    TTS segment. The result is padded with empty blocks (or truncated) so its
    length equals *expected_images*.
    """
    if not (text or "").strip():
        return [[] for _ in range(expected_images)]

    blocks = [chunk.strip()
              for chunk in re.split(r"\n\s*\n", text.strip())
              if chunk.strip()]

    # Force exactly one block per image: pad with empties, then truncate.
    blocks = (blocks + [""] * expected_images)[:expected_images]

    return [[line.strip() for line in block.splitlines() if line.strip()]
            for block in blocks]
| # ---------- Build audio for each image from multiple lines ---------- | |
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build the narration track for a single image.

    Each line is synthesized separately (a "speaker| text" prefix overrides
    the default Coqui voice for that line), the segments are concatenated,
    and a dB gain is applied to the combined track.

    Returns (audio_clip, total_duration), or (None, 0.0) when nothing could
    be synthesized.
    """
    segments = []
    for idx, raw in enumerate(lines):
        voice, text = default_voice, raw
        # Optional per-line speaker override: "p225| Hello there" (Coqui only).
        if "|" in raw and tts_backend.startswith("Coqui"):
            spk, rest = raw.split("|", 1)
            if rest.strip():
                text = rest.strip()
                if spk.strip():
                    voice = spk.strip()
        # Randomized name avoids clobbering segments from earlier runs.
        seg_path = os.path.join(
            tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav"
        )
        produced = synth_tts_to_file(text, tts_backend, voice, seg_path)
        if produced and os.path.exists(produced):
            try:
                segments.append(AudioFileClip(produced))
            except Exception:
                pass

    if not segments:
        return None, 0.0

    combined = concat_audios_or_composite(segments)
    if combined is None:
        return None, 0.0

    # dB -> linear; skip the effect entirely when the gain is effectively unity.
    linear = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(linear - 1.0) > 1e-3:
        combined = apply_linear_gain(combined, linear)

    return combined, float(combined.duration)
| # ---------- Variable-duration video (per-image) ---------- | |
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip / CompositeAudioClip / None
):
    """
    Assemble a video in which every image carries its own duration and,
    optionally, its own audio track.
    """
    slides = []
    for frame, duration, audio in zip(frames, per_image_durations, per_image_audios):
        # Clamp to 0.05 s so a zero/near-zero duration can't produce an empty clip.
        slide = clip_with_duration(ImageClip(frame), float(max(0.05, duration)))
        if audio is not None:
            try:
                slide = clip_with_audio(slide, audio)
            except Exception:
                pass
        slides.append(slide)
    return concatenate_videoclips(slides, method="compose")
| # ---------- Main create function ---------- | |
def create_slideshow(
    image_files: List,
    narration_mode: str,  # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
    seconds_per_image: float,
    width: int,
    height: int,
    fit_mode: str,
    bg_color: str,
    sort_mode: str,
    shuffle_seed: Optional[float],
    # single-story inputs
    story_text: str,
    match_video_to_narration: bool,
    # per-image inputs
    per_image_texts: str,  # one line per image
    per_image_multiline_blocks: str,  # blocks separated by blank lines
    per_image_audio_files: List,  # uploaded audio files
    sync_per_image_audio: bool,  # sync duration to audio for per-image modes
    # TTS config
    tts_backend: str,
    tts_voice: Optional[str],
    audio_gain_db: float
):
    """
    Render the slideshow MP4 for the selected narration mode.

    Returns (video_path, status_message); video_path is None on input errors.
    The output is always written to <tempdir>/slideshow_output.mp4, so a new
    run overwrites the previous result.
    """
    if not image_files:
        return None, "Please upload at least one image."
    # Normalize image paths — gradio may hand back objects with .name/.path,
    # or plain path strings, depending on version/config.
    paths = []
    for f in image_files:
        p = getattr(f, "name", None) or getattr(f, "path", None) or f
        if p and os.path.exists(p):
            paths.append(p)
    if not paths:
        return None, "Could not read the uploaded images."
    # Ordering: sort by basename, or shuffle with a fixed seed for reproducibility.
    if sort_mode == "Filename (A→Z)":
        paths = sorted(paths, key=lambda p: os.path.basename(p).lower())
    elif sort_mode == "Filename (Z→A)":
        paths = sorted(paths, key=lambda p: os.path.basename(p).lower(), reverse=True)
    elif sort_mode == "Shuffle":
        rnd = random.Random(int(shuffle_seed or 0))
        rnd.shuffle(paths)
    # Load every image as a fixed-size RGB frame.
    width = int(width); height = int(height)
    frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
    num_images = len(frames)
    out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
    # --- Per-image AUDIO FILES ---
    if narration_mode == "Per-image (files)" and per_image_audio_files:
        # Normalize audio paths & sort by filename
        aud_paths = []
        for a in per_image_audio_files:
            ap = getattr(a, "name", None) or getattr(a, "path", None) or a
            if ap and os.path.exists(ap):
                aud_paths.append(ap)
        aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
        # Basename match, then index fallback
        def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
            # Pair each image with the audio file sharing its basename (case-
            # insensitive, extension ignored); any unmatched audio is then
            # handed out to the remaining images in sorted order.
            result = [None] * len(image_paths)
            if not audio_paths:
                return result
            audio_map = {}
            for a in audio_paths:
                base = os.path.splitext(os.path.basename(a))[0].lower()
                audio_map[base] = a
            used = set()
            for i, ip in enumerate(image_paths):
                base = os.path.splitext(os.path.basename(ip))[0].lower()
                if base in audio_map:
                    result[i] = audio_map[base]; used.add(audio_map[base])
            leftover = [a for a in audio_paths if a not in used]
            for i in range(len(image_paths)):
                if result[i] is None and leftover:
                    result[i] = leftover.pop(0)
            return result
        per_img_audio_paths = map_audio_to_images_by_name(paths, aud_paths)
        per_img_audios = []
        per_img_durs = []
        for ap in per_img_audio_paths:
            if ap:
                try:
                    aclip = AudioFileClip(ap)
                    per_img_audios.append(aclip)
                    # Either hold the image for its audio's length, or use the slider value.
                    per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
                except Exception:
                    # Unreadable audio: fall back to a silent slide of default length.
                    per_img_audios.append(None)
                    per_img_durs.append(float(seconds_per_image))
            else:
                per_img_audios.append(None)
                per_img_durs.append(float(seconds_per_image))
        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        final_clip.write_videofile(
            out_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset="medium",
            threads=max(1, (os.cpu_count() or 2) // 2),
        )
        return out_path, "Done! Per-image audio applied."
    # --- Per-image TTS per single line ---
    if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
        # Keep empty lines so line index N still maps to image N.
        lines = [ln.strip() for ln in per_image_texts.splitlines()]
        # Pad / trim to image count
        if len(lines) < num_images:
            lines += [""] * (num_images - len(lines))
        else:
            lines = lines[:num_images]
        tmp_dir = tempfile.gettempdir()
        per_img_audios = []
        per_img_durs = []
        for idx, text in enumerate(lines):
            voice = tts_voice
            # Optional per-line "speaker| text" override (Coqui only).
            if "|" in text and tts_backend.startswith("Coqui"):
                maybe_speaker, maybe_text = text.split("|", 1)
                if maybe_text.strip():
                    text = maybe_text.strip()
                    if maybe_speaker.strip():
                        voice = maybe_speaker.strip()
            apath = None
            if text:
                apath = os.path.join(tmp_dir, f"tts_line_{idx}.wav")
                gen = synth_tts_to_file(text, tts_backend, voice, apath)
                apath = gen if gen and os.path.exists(gen) else None
            if apath:
                try:
                    aclip = AudioFileClip(apath)
                    per_img_audios.append(aclip)
                    per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
                except Exception:
                    per_img_audios.append(None)
                    per_img_durs.append(float(seconds_per_image))
            else:
                per_img_audios.append(None)
                per_img_durs.append(float(seconds_per_image))
        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        final_clip.write_videofile(
            out_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset="medium",
            threads=max(1, (os.cpu_count() or 2) // 2),
        )
        return out_path, "Done! Per-image TTS (single line) applied."
    # --- Per-image TTS multiline per image ---
    if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
        blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
        tmp_dir = tempfile.gettempdir()
        per_img_audios = []
        per_img_durs = []
        for idx, lines in enumerate(blocks):
            if not lines:
                # Empty block: silent slide with the default duration.
                per_img_audios.append(None)
                per_img_durs.append(float(seconds_per_image))
                continue
            aclip, total = build_audio_for_image_lines(
                lines=lines,
                tts_backend=tts_backend,
                default_voice=tts_voice,
                audio_gain_db=audio_gain_db,
                tmp_dir=tmp_dir
            )
            if aclip is not None:
                per_img_audios.append(aclip)
                per_img_durs.append(float(total) if sync_per_image_audio else float(seconds_per_image))
            else:
                per_img_audios.append(None)
                per_img_durs.append(float(seconds_per_image))
        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        final_clip.write_videofile(
            out_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset="medium",
            threads=max(1, (os.cpu_count() or 2) // 2),
        )
        return out_path, "Done! Per-image multiline TTS applied."
    # --- Single story (one track) ---
    if narration_mode == "Single story" and story_text.strip():
        # Base video (uniform duration): repeat each frame for seconds_per_image.
        fps = 24
        repeats = max(1, int(round(float(seconds_per_image) * fps)))
        expanded = []
        for frame in frames:
            expanded.extend([frame] * repeats)
        clip = ImageSequenceClip(expanded, fps=fps)
        # Synthesize the whole story as one narration track.
        tmp = tempfile.gettempdir()
        audio_path = os.path.join(tmp, "narration_single.wav")
        gen = synth_tts_to_file(story_text.strip(), tts_backend, tts_voice, audio_path)
        audio_path = gen if gen and os.path.exists(gen) else None
        if audio_path:
            try:
                aclip = AudioFileClip(audio_path)
                if match_video_to_narration:
                    # Stretch/trim the video to the narration's length.
                    clip = clip_with_duration(clip, float(aclip.duration))
                # dB -> linear gain; skip when effectively unity.
                gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
                if abs(gain - 1.0) > 1e-3:
                    aclip = apply_linear_gain(aclip, gain)
                clip = clip_with_audio(clip, aclip)
            except Exception:
                # Best-effort: on any audio failure, ship the silent video.
                pass
        clip.write_videofile(
            out_path,
            codec="libx264",
            audio_codec="aac",
            fps=fps,
            preset="medium",
            threads=max(1, (os.cpu_count() or 2) // 2),
        )
        return out_path, "Done! Story narration applied."
    # --- No narration: uniform duration slideshow ---
    fps = 24
    repeats = max(1, int(round(float(seconds_per_image) * fps)))
    expanded = []
    for frame in frames:
        expanded.extend([frame] * repeats)
    clip = ImageSequenceClip(expanded, fps=fps)
    clip.write_videofile(
        out_path,
        codec="libx264",
        audio_codec="aac",
        fps=fps,
        preset="medium",
        threads=max(1, (os.cpu_count() or 2) // 2),
    )
    return out_path, "Done! Video created without narration."
| # ---------- UI ---------- | |
def update_voice_choices(backend_name: str):
    """Refresh the voice dropdown and status text for the chosen TTS backend."""
    voices = list_voices(backend_name)
    if voices:
        selected = voices[0]
        status = f"Loaded {len(voices)} voices."
    else:
        selected = None
        status = "No voices found (or using gTTS)."
    return gr.update(choices=voices, value=selected), status
def ui():
    """Build and return the Gradio Blocks interface (layout + event wiring)."""
    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
        # Intro / feature summary shown at the top of the page.
        gr.Markdown(
            """
# 🖼️ → 🎬 Slideshow Maker
- **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
- **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
- **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
"""
        )
        with gr.Row():
            # Left column: image inputs and framing options.
            with gr.Column(scale=1):
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
                    label="Seconds per Image (used when not syncing to audio)"
                )
                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")
                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")
            # Right column: narration mode and its inputs.
            with gr.Column(scale=1):
                narration_mode = gr.Radio(
                    ["None",
                     "Single story",
                     "Per-image (files)",
                     "Per-image (TTS per line)",
                     "Per-image (TTS multiline per image)"],
                    value="None",
                    label="Narration mode"
                )
                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=20,
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )
                # Per-image UI (files)
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                sync_per_image_audio = gr.Checkbox(
                    value=True, label="Sync image to audio duration (per-image modes)"
                )
                # Per-image UI (text)
                per_image_texts = gr.Textbox(
                    label="Per-image TTS (one line per image)",
                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
                    lines=8,
                )
                per_image_multiline_blocks = gr.Textbox(
                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
                    lines=40,
                )
                with gr.Row():
                    tts_backend = gr.Dropdown(
                        ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                        value="Coqui (VCTK multi-speaker)",
                        label="TTS backend"
                    )
                    tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
                voice_status = gr.Markdown("")
                audio_gain_db = gr.Slider(
                    minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
                )
        # NOTE(review): original indentation was lost in extraction; the run
        # button, status, and video output are placed at Blocks level here —
        # confirm against the intended layout.
        run_btn = gr.Button("Create Video", variant="primary")
        status = gr.Markdown("")
        video_out = gr.Video(label="Result", autoplay=False)
        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Also populate on initial load
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Main action: inputs are ordered to match create_slideshow's signature.
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files,
                narration_mode,
                seconds_per_image,
                width, height,
                fit_mode, bg_color,
                sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image text inputs
                per_image_texts, per_image_multiline_blocks,
                # per-image files + sync
                per_image_audio_files, sync_per_image_audio,
                # tts
                tts_backend, tts_voice,
                audio_gain_db
            ],
            outputs=[video_out, status],
        )
        # Usage tips shown below the output.
        gr.Markdown(
            """
**Tips**
- *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
- *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
- *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
"""
        )
    return demo
if __name__ == "__main__":
    # Launch the Gradio app when this file is executed directly.
    ui().launch()