Spaces:

Vishwas1
/

VideoCreator

Runtime error

App Files Files Community

Vishwas1 committed on Sep 7, 2025

Commit

d7dbd51

verified ·

1 Parent(s): 43229da

Create app.py

Browse files

Files changed (1) hide show

app.py +562 -0

app.py ADDED Viewed

	@@ -0,0 +1,562 @@

+# app.py — Slideshow with per-image audio + multi-voice TTS (HF Coqui)
+# Works with MoviePy v2.x; falls back to v1 where possible.
+import os
+import tempfile
+import random
+from typing import Optional, List, Dict, Tuple
+import numpy as np
+from PIL import Image
+import gradio as gr
+# ---- MoviePy imports with v2/v1 compatibility ----
+MPY_V2 = False
+afx = None  # audio effects (v2)
+try:
+    # v2.x preferred
+    from moviepy import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips  # type: ignore
+    try:
+        from moviepy import afx as _afx  # type: ignore
+        afx = _afx
+    except Exception:
+        afx = None
+    MPY_V2 = True
+except Exception:
+    # v1.x fallback
+    from moviepy.editor import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips  # type: ignore
+    MPY_V2 = False
+# ---------- Small compatibility helpers ----------
def clip_with_duration(clip, duration: float):
    """
    Return ``clip`` with its duration set to ``duration``.

    Uses the clip's ``with_duration`` method when it has one (MoviePy v2
    naming) and falls back to ``set_duration`` otherwise (MoviePy v1 naming).
    """
    try:
        set_length = clip.with_duration  # v2
    except AttributeError:
        set_length = clip.set_duration  # v1
    return set_length(duration)
def clip_with_audio(clip, audio):
    """
    Return ``clip`` with ``audio`` attached as its soundtrack.

    Uses the clip's ``with_audio`` method when it has one (MoviePy v2 naming)
    and falls back to ``set_audio`` otherwise (MoviePy v1 naming).
    """
    try:
        attach = clip.with_audio  # v2
    except AttributeError:
        attach = clip.set_audio  # v1
    return attach(audio)
def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Try to scale the volume of an audio clip by a linear factor.

    Strategies are attempted in order and the first one that works wins:

    1. ``audio_clip.with_effects([afx.MultiplyVolume(gain)])`` (MoviePy v2 effects).
    2. ``audio_clip.fx(afx.volumex, gain)`` (MoviePy v1 effects).
    3. ``audio_clip.with_volume_scaled(gain)`` / ``audio_clip.volumex(gain)``,
       the clip's own volume methods. These do not need the module-level
       ``afx`` object, which is ``None`` whenever the v1 import path is taken
       or the v2 ``afx`` import fails; without this step the gain would be
       silently ignored in those setups.

    This is deliberately best-effort: if every strategy is unavailable or
    raises, the original clip is returned unchanged (no-op).

    Args:
        audio_clip: The audio clip to scale.
        gain_linear: Linear multiplier for the volume (1.0 leaves it unchanged).

    Returns:
        A volume-scaled clip, or ``audio_clip`` itself when no strategy worked.
    """
    # The clip attribute is checked before `afx` so the effects module is only
    # consulted for clips that could actually use it.
    if hasattr(audio_clip, "with_effects") and afx is not None:
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            pass  # fall through to the next strategy
    if hasattr(audio_clip, "fx") and afx is not None and hasattr(afx, "volumex"):
        try:
            return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            pass  # fall through to the next strategy
    for method_name in ("with_volume_scaled", "volumex"):
        scale = getattr(audio_clip, method_name, None)
        if not callable(scale):
            continue
        try:
            return scale(gain_linear)
        except Exception:
            continue
    return audio_clip
+# ---------- Image utilities ----------
def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Load an image and fit it into a ``width`` x ``height`` RGB frame.

    Args:
        path: Image file to open with Pillow.
        width: Output frame width in pixels.
        height: Output frame height in pixels.
        fit: ``"stretch"`` resizes ignoring the aspect ratio; ``"cover"``
            scales the image to fill the frame and center-crops the overflow;
            any other value is treated as ``"contain"`` (scale to fit inside
            the frame, centered on a canvas filled with ``bg``).
        bg: Canvas colour used behind the image in ``"contain"`` mode.

    Returns:
        The fitted RGB image converted to a NumPy array.
    """
    # `convert` returns an independent, fully loaded image, so the underlying
    # file can be closed immediately instead of lingering until garbage collection.
    with Image.open(path) as source:
        img = source.convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    iw, ih = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(iw) / float(ih)

    if fit == "cover":
        # Scale so the image covers the whole frame; `max` keeps rounding from
        # leaving the scaled image one pixel short of the frame.
        if src_aspect > target_aspect:
            new_h = height
            new_w = max(width, int(round(src_aspect * new_h)))
        else:
            new_w = width
            new_h = max(height, int(round(new_w / src_aspect)))
        img = img.resize((new_w, new_h), Image.LANCZOS)
        left = (new_w - width) // 2
        top = (new_h - height) // 2
        return np.array(img.crop((left, top, left + width, top + height)))

    # contain: scale to fit inside the frame; `max(1, ...)` keeps extreme
    # aspect ratios from rounding a side down to zero pixels.
    canvas = Image.new("RGB", (width, height), bg)
    if src_aspect > target_aspect:
        new_w = width
        new_h = max(1, int(round(new_w / src_aspect)))
    else:
        new_h = height
        new_w = max(1, int(round(src_aspect * new_h)))
    resized = img.resize((new_w, new_h), Image.LANCZOS)
    left = (width - new_w) // 2
    top = (height - new_h) // 2
    canvas.paste(resized, (left, top))
    return np.array(canvas)
+# ---------- TTS backends ----------
# Loaded TTS engines keyed by backend display name, so heavy models load once.
_TTS_CACHE: Dict[str, object] = {}


def _get_tts_backend(backend_name: str):
    """
    Return the TTS backend object for a backend display name, loading lazily.

    - "gTTS (simple)" -> the sentinel string "gTTS"
    - "Coqui (VCTK multi-speaker)" -> a coqui-ai/TTS instance of
      tts_models/en/vctk/vits, created on first use and cached afterwards
    - anything else -> None
    """
    if backend_name == "gTTS (simple)":
        return "gTTS"
    if backend_name != "Coqui (VCTK multi-speaker)":
        return None
    try:
        return _TTS_CACHE[backend_name]
    except KeyError:
        from TTS.api import TTS  # heavy import, deferred until first needed
        engine = TTS("tts_models/en/vctk/vits")
        _TTS_CACHE[backend_name] = engine
        return engine
def list_voices(backend_name: str) -> List[str]:
    """
    Return the selectable voice names for a TTS backend.

    Only "Coqui (VCTK multi-speaker)" has voices: the backend's ``speakers``
    are returned sorted, with the first available name out of p225, p226,
    p233, p243 moved to the front so it becomes the default choice.
    Every other backend, and any failure while loading or reading the
    speakers, yields an empty list.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []
    try:
        engine = _get_tts_backend(backend_name)
        voices = sorted(getattr(engine, "speakers", []))
        preferred = next(
            (name for name in ("p225", "p226", "p233", "p243") if name in voices),
            None,
        )
        if preferred is not None:
            voices.remove(preferred)
            voices.insert(0, preferred)
        return voices
    except Exception:
        return []
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Synthesize ``text`` to an audio file with the selected TTS backend.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text produces no file.
        backend_name: "Coqui (VCTK multi-speaker)" or "gTTS (simple)".
        voice: Speaker name for Coqui; ignored by gTTS. When it is missing or
            not one of the voices reported by ``list_voices``, the first
            listed voice is used instead, because the multi-speaker model
            cannot synthesize without a valid speaker.
        out_path: Requested output path. Its extension is replaced with
            ``.wav`` for Coqui and ``.mp3`` for gTTS when it differs.

    Returns:
        The path actually written, or ``None`` when there was nothing to say,
        the backend is unknown, or synthesis failed (best-effort: backend
        errors are swallowed so callers can fall back to a silent slide).
    """
    text = (text or "").strip()
    if not text:
        return None
    stem = os.path.splitext(out_path)[0]
    if backend_name == "Coqui (VCTK multi-speaker)":
        try:
            tts = _get_tts_backend(backend_name)
            # The UI's voice dropdown can legitimately be empty (value None);
            # fall back to the same default the dropdown would preselect.
            known_voices = list_voices(backend_name)
            if known_voices and voice not in known_voices:
                voice = known_voices[0]
            # Coqui writes WAV by default; make sure the path says so.
            if not out_path.lower().endswith(".wav"):
                out_path = stem + ".wav"
            tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
            return out_path
        except Exception:
            return None
    if backend_name == "gTTS (simple)":
        try:
            from gtts import gTTS
            if not out_path.lower().endswith(".mp3"):
                out_path = stem + ".mp3"
            gTTS(text=text, lang="en").save(out_path)
            return out_path
        except Exception:
            return None
    return None
+# ---------- Helpers for per-image mapping ----------
def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
    """
    Pair every image with an audio path, or None when none is left.

    Images first claim the audio file whose basename (extension stripped,
    case-insensitive) equals their own. Audio files that nobody claimed are
    then handed out, in their given order, to the images still without audio.
    """
    def stem(path):
        return os.path.splitext(os.path.basename(path))[0].lower()

    if not audio_paths:
        return [None] * len(image_paths)

    # When several audio files share a stem, the last one wins the name match.
    by_stem = {stem(audio): audio for audio in audio_paths}
    by_name = [by_stem.get(stem(image)) for image in image_paths]

    claimed = {audio for audio in by_name if audio is not None}
    spare = iter([audio for audio in audio_paths if audio not in claimed])
    return [audio if audio is not None else next(spare, None) for audio in by_name]
+# ---------- Core builder ----------
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[str]],
    audio_gain_db: float
):
    """
    Concatenate one still-image clip per frame, each with its own length and optional audio.

    The three lists are consumed in lockstep. ``audio_gain_db`` is converted
    from decibels to a linear factor and applied only when it differs
    noticeably from unity. A slide whose audio cannot be loaded or attached
    is kept, just without sound.
    """
    def make_segment(frame, seconds, audio_path):
        segment = clip_with_duration(ImageClip(frame), float(seconds))
        if not audio_path:
            return segment
        try:
            track = AudioFileClip(audio_path)
            factor = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
            if abs(factor - 1.0) > 1e-3:
                track = apply_linear_gain(track, factor)
            return clip_with_audio(segment, track)
        except Exception:
            # Best-effort: fall back to the silent slide.
            return segment

    segments = [
        make_segment(frame, seconds, audio_path)
        for frame, seconds, audio_path in zip(frames, per_image_durations, per_image_audios)
    ]
    # Compose ensures audio & size are aligned
    return concatenate_videoclips(segments, method="compose")
def _slideshow_existing_paths(uploads) -> List[str]:
    """Resolve uploaded items (file objects or plain paths) to paths that exist on disk."""
    resolved = []
    for item in uploads:
        candidate = getattr(item, "name", None) or getattr(item, "path", None) or item
        if candidate and os.path.exists(candidate):
            resolved.append(candidate)
    return resolved


def _slideshow_audio_seconds(audio_path: Optional[str], fallback_seconds: float) -> float:
    """Return the audio file's duration, or ``fallback_seconds`` when there is no readable audio."""
    if not audio_path:
        return float(fallback_seconds)
    try:
        probe = AudioFileClip(audio_path)
    except Exception:
        return float(fallback_seconds)
    try:
        return float(probe.duration)
    except Exception:
        return float(fallback_seconds)
    finally:
        # The clip was opened only to read its length; release it right away.
        probe.close()


def _slideshow_uniform_clip(frames, seconds_per_image: float, fps: int):
    """Build a clip that shows every frame for the same number of seconds."""
    repeats = max(1, int(round(float(seconds_per_image) * fps)))
    expanded = [frame for frame in frames for _ in range(repeats)]
    return ImageSequenceClip(expanded, fps=fps)


def _slideshow_write_mp4(clip, out_path: str, fps: int) -> None:
    """Encode ``clip`` to ``out_path`` as H.264 video with AAC audio."""
    clip.write_videofile(
        out_path,
        codec="libx264",
        audio_codec="aac",
        fps=fps,
        preset="medium",
        threads=max(1, (os.cpu_count() or 2) // 2),
    )


def create_slideshow(
    image_files: List,
    narration_mode: str,                 # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)"
    seconds_per_image: float,
    width: int,
    height: int,
    fit_mode: str,
    bg_color: str,
    sort_mode: str,
    shuffle_seed: Optional[float],
    # single-story inputs
    story_text: str,
    match_video_to_narration: bool,
    # per-image inputs
    per_image_texts: str,                # one line per image; optional "speaker| text" when using Coqui
    per_image_audio_files: List,         # uploaded audio files
    # TTS config
    tts_backend: str,
    tts_voice: Optional[str],
    audio_gain_db: float
):
    """
    Render the uploaded images into an MP4 slideshow, optionally narrated.

    Narration modes:
        - "Per-image (files)": uploaded audio files are matched to images by
          filename, then by order; each slide lasts as long as its audio.
        - "Per-image (TTS per line)": one line of ``per_image_texts`` is
          synthesized per image; with a Coqui backend a line may be written as
          ``speaker| text`` to override ``tts_voice`` for that line.
        - "Single story": ``story_text`` is synthesized into one track laid
          over a uniform slideshow; with ``match_video_to_narration`` the
          video duration is set to the narration length.
        - anything else, or a narration mode whose input is empty: a silent
          slideshow showing each image for ``seconds_per_image``.

    Returns:
        ``(video_path, status_message)``. ``video_path`` is ``None`` when the
        inputs are unusable; the message then explains why.
    """
    if not image_files:
        return None, "Please upload at least one image."
    paths = _slideshow_existing_paths(image_files)
    if not paths:
        return None, "Could not read the uploaded images."

    # Text and backend values can arrive as None; normalise once so the
    # `.strip()` / `.startswith()` calls below cannot crash.
    story_text = (story_text or "").strip()
    per_image_texts = per_image_texts or ""
    tts_backend = tts_backend or ""

    try:
        width = int(width)
        height = int(height)
    except (TypeError, ValueError):
        return None, "Width and height must be whole numbers."
    if width <= 0 or height <= 0:
        return None, "Width and height must be positive."

    # Order
    if sort_mode == "Filename (A→Z)":
        paths = sorted(paths, key=lambda p: os.path.basename(p).lower())
    elif sort_mode == "Filename (Z→A)":
        paths = sorted(paths, key=lambda p: os.path.basename(p).lower(), reverse=True)
    elif sort_mode == "Shuffle":
        random.Random(int(shuffle_seed or 0)).shuffle(paths)

    frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
    tmp_dir = tempfile.gettempdir()
    out_path = os.path.join(tmp_dir, "slideshow_output.mp4")
    fps = 24

    # --- Per-image AUDIO FILES ---
    if narration_mode == "Per-image (files)" and per_image_audio_files:
        aud_paths = sorted(
            _slideshow_existing_paths(per_image_audio_files),
            key=lambda p: os.path.basename(p).lower(),
        )
        per_img_audio = map_audio_to_images_by_name(paths, aud_paths)
        # Each slide lasts as long as its audio; slides without audio use the default.
        durations = [_slideshow_audio_seconds(ap, seconds_per_image) for ap in per_img_audio]
        final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
        _slideshow_write_mp4(final_clip, out_path, fps)
        return out_path, "Done! Per-image audio applied."

    # --- Per-image TTS per line ---
    if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
        lines = [ln.strip() for ln in per_image_texts.splitlines()]
        # Pad / trim to image count
        lines = (lines + [""] * len(paths))[:len(paths)]
        per_img_audio = []
        durations = []
        failed_lines = 0
        for idx, text in enumerate(lines):
            voice = tts_voice
            # Optional "speaker| text" override for Coqui
            if "|" in text and tts_backend.startswith("Coqui"):
                maybe_speaker, _, maybe_text = text.partition("|")
                if maybe_text.strip():
                    text = maybe_text.strip()
                    if maybe_speaker.strip():
                        voice = maybe_speaker.strip()
            apath = None
            if text:
                requested = os.path.join(tmp_dir, f"tts_line_{idx}.wav")
                generated = synth_tts_to_file(text, tts_backend, voice, requested)
                if generated and os.path.exists(generated):
                    apath = generated
                else:
                    failed_lines += 1
            per_img_audio.append(apath)
            durations.append(_slideshow_audio_seconds(apath, seconds_per_image))
        final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
        _slideshow_write_mp4(final_clip, out_path, fps)
        if failed_lines:
            # Surface synthesis failures instead of claiming full success.
            return out_path, (
                f"Done, but TTS failed for {failed_lines} line(s); "
                "those images have no audio."
            )
        return out_path, "Done! Per-image TTS applied."

    # --- Single story (one track) ---
    if narration_mode == "Single story" and story_text:
        clip = _slideshow_uniform_clip(frames, seconds_per_image, fps)
        requested = os.path.join(tmp_dir, "narration_single.wav")
        generated = synth_tts_to_file(story_text, tts_backend, tts_voice, requested)
        narrated = False
        if generated and os.path.exists(generated):
            try:
                aclip = AudioFileClip(generated)
                if match_video_to_narration:
                    clip = clip_with_duration(clip, float(aclip.duration))
                gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
                if abs(gain - 1.0) > 1e-3:
                    aclip = apply_linear_gain(aclip, gain)
                clip = clip_with_audio(clip, aclip)
                narrated = True
            except Exception:
                # Best-effort: keep the video without narration; the status
                # message below reports that the narration is missing.
                narrated = False
        _slideshow_write_mp4(clip, out_path, fps)
        if not narrated:
            return out_path, "Video created, but the narration could not be generated."
        return out_path, "Done! Story narration applied."

    # --- No narration: uniform duration slideshow ---
    clip = _slideshow_uniform_clip(frames, seconds_per_image, fps)
    _slideshow_write_mp4(clip, out_path, fps)
    return out_path, "Done! Video created without narration."
+# ---------- UI ----------
def update_voice_choices(backend_name: str):
    """
    Refresh the voice dropdown for the chosen TTS backend.

    Returns a ``gr.update`` carrying the voices from ``list_voices`` (with the
    first one preselected, or no selection when the list is empty) together
    with a status message for the UI.
    """
    available = list_voices(backend_name)
    if available:
        selected = available[0]
        message = f"Loaded {len(available)} voices."
    else:
        selected = None
        message = "No voices found (or using gTTS)."
    return gr.update(choices=available, value=selected), message
def ui():
    """
    Build the Gradio Blocks app for the slideshow maker.

    The left column holds the image upload, ordering and sizing controls; the
    right column holds the narration mode, narration inputs, TTS settings and
    the "Create Video" button, which calls ``create_slideshow``. The voice
    dropdown is refreshed through ``update_voice_choices`` both when the
    backend changes and once when the page loads.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Slideshow + Per-Image Audio + Voice Picker", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🖼️ → 🎬 Slideshow Maker
            - **Per-image audio**: upload audio files (matched by filename or order) **or** generate per-image narration from text lines.
            - **TTS voices**: pick from **Coqui VCTK**'s multi-speaker voices (male/female), or use gTTS as a lightweight fallback.
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5, label="Seconds per Image (used when no per-image audio)"
                )
                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")
                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")
            with gr.Column(scale=1):
                narration_mode = gr.Radio(
                    ["None", "Single story", "Per-image (files)", "Per-image (TTS per line)"],
                    value="None",
                    label="Narration mode"
                )
                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=4,
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )
                # Per-image UI
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                # Multi-line box: this input is parsed one line per image.
                per_image_texts = gr.Textbox(
                    label="Per-image TTS text (one line per image). For Coqui, optional 'speaker| text' per line.",
                    placeholder="Line 1 text\nLine 2 text\n...",
                    lines=6,
                )
                with gr.Row():
                    tts_backend = gr.Dropdown(
                        ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                        value="Coqui (VCTK multi-speaker)",
                        label="TTS backend"
                    )
                    tts_voice = gr.Dropdown(choices=[], label="Voice (for Coqui)")
                voice_status = gr.Markdown("")
                audio_gain_db = gr.Slider(
                    minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
                )
                run_btn = gr.Button("Create Video", variant="primary")
                status = gr.Markdown("")
        video_out = gr.Video(label="Result", autoplay=False)
        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # `.change` only fires on user edits, yet Coqui is the preselected
        # backend: populate the voice list once on page load as well, so the
        # dropdown is not left empty for the default configuration.
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Main action
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files,
                narration_mode,
                seconds_per_image,
                width, height,
                fit_mode, bg_color,
                sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image
                per_image_texts, per_image_audio_files,
                # tts
                tts_backend, tts_voice,
                audio_gain_db
            ],
            outputs=[video_out, status],
        )
        gr.Markdown(
            """
            **Tips**
            - *Per-image audio (files)*: name audio like your images (e.g., `001.jpg` ↔ `001.wav`) for automatic matching.
            - *Per-image TTS per line*: supply the same number of lines as images; extra/missing lines are trimmed/padded.
            - *Coqui voice per line*: prefix a line with `speaker| text` to override the dropdown voice (e.g., `p225| Hello there`).
            """
        )
    return demo
if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server when run as a script.
    demo_app = ui()
    demo_app.launch()