# Spaces:
#
# Vishwas1
# /
#
# VideoCreator
#
# Runtime error
#
# File size: 27,858 Bytes

# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.

import os
import re
import tempfile
import random
from typing import Optional, List, Dict

import numpy as np
from PIL import Image
import gradio as gr

# ---- MoviePy imports with v2/v1 compatibility ----
# Defaults for the optional pieces; the import attempts below overwrite them
# with whatever the installed MoviePy actually provides.
MPY_V2 = False  # becomes True only when the v2-style top-level imports succeed
afx = None  # audio effects (v2)
_CompositeAudioClip = None  # CompositeAudioClip class, or None when not importable
_concat_audios = None  # concatenate_audioclips function, or None when not importable

try:
    # v2.x preferred
    from moviepy import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
    )
    # Each optional name is imported on its own so that one missing symbol
    # does not force the whole module onto the v1 fallback.
    try:
        from moviepy import afx as _afx  # type: ignore
        afx = _afx
    except Exception:
        afx = None
    try:
        from moviepy import CompositeAudioClip as _CompositeAudioClip  # type: ignore
    except Exception:
        _CompositeAudioClip = None
    try:
        from moviepy import concatenate_audioclips as _concat_audios  # type: ignore
    except Exception:
        _concat_audios = None
    MPY_V2 = True
except Exception:
    # v1.x fallback
    # NOTE: this branch never assigns `afx`, so it stays None on the v1 path.
    from moviepy.editor import (
        ImageSequenceClip,
        AudioFileClip,
        ImageClip,
        concatenate_videoclips,
        CompositeAudioClip as _CompositeAudioClip,   # type: ignore
        concatenate_audioclips as _concat_audios,    # type: ignore
    )
    MPY_V2 = False


# ---------- Small compatibility helpers ----------

def clip_with_duration(clip, duration: float):
    """Return *clip* with its duration set to *duration* seconds.

    Prefers the ``with_duration`` method (MoviePy v2 naming) and falls back
    to ``set_duration`` (MoviePy v1 naming) when the former is absent.
    """
    try:
        setter = clip.with_duration  # v2
    except AttributeError:
        setter = clip.set_duration  # v1
    return setter(duration)


def clip_with_audio(clip, audio):
    """Return *clip* with *audio* attached as its soundtrack.

    Prefers the ``with_audio`` method (MoviePy v2 naming) and falls back to
    ``set_audio`` (MoviePy v1 naming) when the former is absent.
    """
    try:
        attach = clip.with_audio  # v2
    except AttributeError:
        attach = clip.set_audio  # v1
    return attach(audio)


def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Try to apply a linear gain to an audio clip (best effort).

    Strategies are attempted in order; the first one that succeeds wins:
      1. ``audio_clip.with_effects([afx.MultiplyVolume(gain)])`` (MoviePy v2).
      2. ``audio_clip.fx(afx.volumex, gain)`` when an effects module exposing
         ``volumex`` is available.
      3. ``audio_clip.volumex(gain)`` when the clip itself exposes that
         method. This covers the case where the module-level ``afx`` is None
         (as on the v1 import path of this file), which previously made the
         gain silently a no-op.

    Args:
        audio_clip: the clip to scale.
        gain_linear: linear amplitude multiplier (1.0 = unchanged).

    Returns:
        The scaled clip, or the original clip unchanged when no strategy is
        available or every attempt raises.
    """
    # Strategy 1: v2 effect objects.
    if hasattr(audio_clip, "with_effects") and afx is not None:
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            pass  # fall through to the next strategy

    # Strategy 2: function-style effect taken from the effects module.
    if hasattr(audio_clip, "fx") and afx is not None and hasattr(afx, "volumex"):
        try:
            return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            pass  # fall through to the next strategy

    # Strategy 3: the clip's own volumex method.
    # NOTE(review): MoviePy v1's `moviepy.editor` is expected to attach
    # `volumex` to audio clips; confirm against the installed version.
    volumex = getattr(audio_clip, "volumex", None)
    if callable(volumex):
        try:
            return volumex(gain_linear)
        except Exception:
            pass  # deliberate best-effort: keep the unscaled clip

    return audio_clip


def concat_audios_or_composite(clips: List):
    """
    Concatenate audio clips into one sequential track.

    Prefers the built-in concatenator; otherwise composites the clips
    sequentially, using start offsets to emulate concatenation.

    Args:
        clips: audio clips in playback order.

    Returns:
        None for an empty list, the single clip for a one-element list,
        otherwise the concatenated/composited clip. When neither strategy is
        usable, the first clip is returned as a last resort.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            pass  # fall back to the manual composite below

    # Fallback: sequential CompositeAudioClip
    if _CompositeAudioClip is not None:
        offset = 0.0
        placed = []
        for clip in clips:
            try:
                # Read the duration first so a clip is only placed when the
                # running offset can also be advanced for it.
                length = float(clip.duration)
                # `with_start` is the v2 name, `set_start` the v1 name; the
                # previous code only tried `set_start`, so every clip was
                # dropped on v2.
                shift = getattr(clip, "with_start", None) or clip.set_start
                placed.append(shift(offset))
            except Exception:
                continue  # skip clips that cannot be positioned
            offset += length

        # Only build the composite when at least one clip was positioned;
        # otherwise fall through to the last resort instead of compositing
        # an empty list.
        if placed:
            comp = _CompositeAudioClip(placed)
            try:
                comp = clip_with_duration(comp, offset)
            except Exception:
                pass  # keep whatever duration the composite computed itself
            return comp

    # Last resort
    return clips[0]


# ---------- Image utilities ----------

def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Load an image file and return an RGB numpy array of shape (height, width, 3).

    Args:
        path: image file to open.
        width: target width in pixels.
        height: target height in pixels.
        fit: sizing strategy:
          - "contain": letterbox to fit within target size (keeps aspect),
            background fills the rest. Any unrecognised value also takes
            this path.
          - "cover":   fill target size (keeps aspect) with center crop.
          - "stretch": distort to target size.
        bg: background colour used by "contain".

    Returns:
        The fitted image as a numpy array.
    """
    # Open inside a context manager so the file handle is released right
    # away; convert() hands back an image that no longer needs the file.
    with Image.open(path) as src:
        img = src.convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    iw, ih = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(iw) / float(ih)

    if fit == "cover":
        # Scale so the image covers the whole target, then center-crop.
        # max() keeps rounding from leaving the scaled image a pixel short
        # of the crop window.
        if src_aspect > target_aspect:
            new_h = height
            new_w = max(width, int(round(src_aspect * new_h)))
        else:
            new_w = width
            new_h = max(height, int(round(new_w / src_aspect)))
        img = img.resize((new_w, new_h), Image.LANCZOS)
        left = (new_w - width) // 2
        top = (new_h - height) // 2
        return np.array(img.crop((left, top, left + width, top + height)))

    # contain (letterbox/pillarbox)
    canvas = Image.new("RGB", (width, height), bg)
    # Clamp to [1, target]: extreme aspect ratios could otherwise round a
    # dimension down to 0, which resize() cannot handle.
    if src_aspect > target_aspect:
        new_w = width
        new_h = min(height, max(1, int(round(new_w / src_aspect))))
    else:
        new_h = height
        new_w = min(width, max(1, int(round(src_aspect * new_h))))
    resized = img.resize((new_w, new_h), Image.LANCZOS)
    left = (width - new_w) // 2
    top = (height - new_h) // 2
    canvas.paste(resized, (left, top))
    return np.array(canvas)


# ---------- TTS backends ----------

# Loaded TTS backend instances, keyed by the backend's display name.
_TTS_CACHE: Dict[str, object] = {}

def _get_tts_backend(backend_name: str):
    """
    Return the backend object for *backend_name*, loading it on first use.

    - "Coqui (VCTK multi-speaker)" -> a ``TTS("tts_models/en/vctk/vits")``
      instance, created once and then served from ``_TTS_CACHE``.
    - "gTTS (simple)" -> the sentinel string "gTTS".
    - anything else -> None.
    """
    if backend_name == "gTTS (simple)":
        return "gTTS"
    if backend_name != "Coqui (VCTK multi-speaker)":
        return None

    try:
        return _TTS_CACHE[backend_name]
    except KeyError:
        pass

    from TTS.api import TTS  # heavy import, deferred until actually needed
    model = TTS("tts_models/en/vctk/vits")
    _TTS_CACHE[backend_name] = model
    return model


def list_voices(backend_name: str) -> List[str]:
    """
    Return the selectable speaker IDs for *backend_name*.

    Only the Coqui VCTK backend has voices; any other backend yields [].
    Speaker names are read from the first non-empty attribute among several
    known locations on the backend object, with a built-in list of VCTK IDs
    as a fallback. A set of preferred IDs is moved to the front and the
    result is de-duplicated while keeping order. Any failure yields a short
    hard-coded list.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []

    # Attribute paths where different TTS versions expose speaker names.
    attr_paths = (
        "speakers",
        "speaker_manager.speaker_names",
        "speaker_manager.speaker_ids",
    )
    # Known VCTK IDs, used when the backend reports nothing.
    known_vctk = [
        "p225", "p226", "p233", "p243", "p254", "p256", "p258", "p259",
        "p270", "p273", "p274", "p278", "p279", "p302", "p311", "p316",
        "p334", "p345", "p360", "p363", "p374",
    ]
    # Common male IDs that are nudged toward the top when present.
    male_first = ["p225", "p226", "p233", "p243", "p270", "p274", "p279", "p311", "p345", "p360", "p363"]

    try:
        model = _get_tts_backend(backend_name)

        found: List[str] = []
        for dotted in attr_paths:
            try:
                node = model
                for attr in dotted.split("."):
                    node = getattr(node, attr)
                found = [str(item) for item in node] if node is not None else []
            except Exception:
                continue
            if found:
                break

        voices = found or known_vctk

        # Preferred IDs first (in preference order), then everything else in
        # its original order; dict.fromkeys drops repeats but keeps order.
        front = [vid for vid in male_first if vid in voices]
        return list(dict.fromkeys(front + voices))

    except Exception:
        # Absolute fallback
        return ["p225", "p226", "p233", "p243"]


def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Synthesize *text* to an audio file with the chosen backend.

    The Coqui backend always writes a ``.wav`` file and gTTS always writes a
    ``.mp3`` file; when *out_path* has a different extension it is swapped
    for the required one.

    Returns:
        The path actually written, or None when the text is blank, the
        backend is unknown, or synthesis raises.
    """
    spoken = (text or "").strip()
    if not spoken:
        return None

    def _with_extension(path: str, ext: str) -> str:
        # Keep the path when it already ends in ext, otherwise swap it in.
        if path.lower().endswith(ext):
            return path
        return os.path.splitext(path)[0] + ext

    def _via_coqui() -> str:
        engine = _get_tts_backend(backend_name)
        target = _with_extension(out_path, ".wav")
        engine.tts_to_file(text=spoken, speaker=voice, file_path=target)
        return target

    def _via_gtts() -> str:
        from gtts import gTTS
        target = _with_extension(out_path, ".mp3")
        gTTS(text=spoken, lang="en").save(target)
        return target

    if backend_name == "Coqui (VCTK multi-speaker)":
        writer = _via_coqui
    elif backend_name == "gTTS (simple)":
        writer = _via_gtts
    else:
        return None

    try:
        return writer()
    except Exception:
        return None


# ---------- Text parsing for multiline-per-image ----------

def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Turn free-form text into one list of TTS lines per image.

    Blank lines separate blocks and each block belongs to one image; every
    non-empty line inside a block is one TTS segment. The block list is
    padded with empty blocks or truncated so it lines up with
    *expected_images*.
    """
    body = (text or "").strip()
    if not body:
        return [[] for _ in range(expected_images)]

    chunks = [piece.strip() for piece in re.split(r"\n\s*\n", body)]
    chunks = [piece for piece in chunks if piece]

    # Pad with empty blocks when short, then cut to the image count.
    missing = max(0, expected_images - len(chunks))
    chunks = (chunks + [""] * missing)[:expected_images]

    parsed = []
    for chunk in chunks:
        stripped_lines = (row.strip() for row in chunk.splitlines())
        parsed.append([row for row in stripped_lines if row])
    return parsed


# ---------- Build audio for each image from multiple lines ----------

def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build the narration track for a single image from several text lines.

    Steps:
      - Generate TTS for each line (a 'speaker| text' prefix overrides the
        voice for that line, Coqui backends only).
      - Concatenate the segments in order.
      - Apply the gain to the final track.

    Args:
        lines: text lines to speak, in order.
        tts_backend: backend display name passed to ``synth_tts_to_file``.
        default_voice: voice used when a line has no speaker override.
        audio_gain_db: gain in decibels; 0/None leaves the audio unchanged.
        tmp_dir: directory that receives the per-line audio files.

    Returns:
        (audio_clip, total_duration), or (None, 0.0) if no audio was produced.
    """
    import uuid  # local import: only needed for unique temp file names

    coqui = (tts_backend or "").startswith("Coqui")
    segments = []
    for idx, raw in enumerate(lines):
        voice = default_voice
        text = raw
        if coqui and "|" in raw:
            spk, txt = raw.split("|", 1)
            # Only treat the prefix as a speaker when text follows the bar.
            if txt.strip():
                text = txt.strip()
                if spk.strip():
                    voice = spk.strip()

        # A UUID keeps concurrent or repeated runs from overwriting each
        # other's segment files (a small random integer could collide).
        out_p = os.path.join(tmp_dir, f"tts_seg_{uuid.uuid4().hex}_{idx}.wav")
        gen = synth_tts_to_file(text, tts_backend, voice, out_p)
        if gen and os.path.exists(gen):
            try:
                segments.append(AudioFileClip(gen))
            except Exception:
                pass  # best effort: an unreadable segment is skipped

    if not segments:
        return None, 0.0

    combined = concat_audios_or_composite(segments)
    if combined is None:
        return None, 0.0

    # Apply gain on the final composite if needed
    gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(gain - 1.0) > 1e-3:
        combined = apply_linear_gain(combined, gain)

    # The combined clip may not carry a duration; fall back to the sum of
    # the segment durations rather than crashing on float(None).
    duration = getattr(combined, "duration", None)
    if duration is None:
        duration = sum(float(seg.duration) for seg in segments)
    return combined, float(duration)


# ---------- Variable-duration video (per-image) ----------

def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip or CompositeAudioClip
):
    """
    Assemble a video in which every image has its own on-screen time and,
    optionally, its own audio track.

    The three lists are consumed in lockstep. Each duration is floored at
    0.05 seconds. If attaching audio to a slide raises, that slide is kept
    without audio.
    """
    def _make_slide(image, seconds, narration):
        slide = clip_with_duration(ImageClip(image), float(max(0.05, seconds)))
        if narration is None:
            return slide
        try:
            return clip_with_audio(slide, narration)
        except Exception:
            return slide  # keep the silent slide

    slides = [
        _make_slide(image, seconds, narration)
        for image, seconds, narration in zip(frames, per_image_durations, per_image_audios)
    ]
    return concatenate_videoclips(slides, method="compose")


# ---------- Main create function ----------

def _basename_key(path) -> str:
    """Case-insensitive file name, used as a sort key."""
    return os.path.basename(path).lower()


def _stem_key(path) -> str:
    """Case-insensitive file name without its extension, used for matching."""
    return os.path.splitext(os.path.basename(path))[0].lower()


def _uploaded_paths(files) -> List[str]:
    """Return the existing on-disk paths of uploads (objects with .name/.path, or plain paths)."""
    found = []
    for item in files or []:
        candidate = getattr(item, "name", None) or getattr(item, "path", None) or item
        # Anything that is not path-like is skipped instead of crashing os.path.exists.
        if isinstance(candidate, (str, os.PathLike)) and os.path.exists(candidate):
            found.append(candidate)
    return found


def _map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
    """Pair each image with the audio sharing its basename, then hand out unused audio in order."""
    by_stem = {_stem_key(a): a for a in audio_paths}
    assigned = [by_stem.get(_stem_key(ip)) for ip in image_paths]
    used = {a for a in assigned if a is not None}
    leftovers = iter([a for a in audio_paths if a not in used])
    return [a if a is not None else next(leftovers, None) for a in assigned]


def _split_speaker(raw: str, default_voice: Optional[str], tts_backend: str):
    """Parse an optional 'speaker| text' prefix (Coqui only); return (voice, text)."""
    if "|" in raw and (tts_backend or "").startswith("Coqui"):
        speaker, _, spoken = raw.partition("|")
        if spoken.strip():
            return (speaker.strip() or default_voice), spoken.strip()
    return default_voice, raw


def _apply_gain_db(audio_clip, gain_db):
    """Scale *audio_clip* by *gain_db* decibels; 0/None (or a negligible gain) is a no-op."""
    gain = 10 ** (float(gain_db) / 20.0) if gain_db else 1.0
    if abs(gain - 1.0) > 1e-3:
        return apply_linear_gain(audio_clip, gain)
    return audio_clip


def _timed_audio(path: Optional[str], fallback_seconds: float, sync: bool, gain_db):
    """Open *path* as audio; return (clip_or_None, seconds the image should stay on screen)."""
    if not path:
        return None, fallback_seconds
    try:
        clip = AudioFileClip(path)
        seconds = float(clip.duration) if sync else fallback_seconds
    except Exception:
        # Clip and duration are decided together so the caller's audio and
        # duration lists can never get out of step.
        return None, fallback_seconds
    return _apply_gain_db(clip, gain_db), seconds


def _uniform_clip(frames, seconds_per_image: float, fps: int):
    """Build an ImageSequenceClip that shows every frame for the same amount of time."""
    repeats = max(1, int(round(float(seconds_per_image) * fps)))
    expanded = []
    for frame in frames:
        expanded.extend([frame] * repeats)
    return ImageSequenceClip(expanded, fps=fps)


def _render(clip, out_path: str, fps: int, closeables=()):
    """Encode *clip* to *out_path*, then close it and any helper clips (best effort)."""
    try:
        clip.write_videofile(
            out_path,
            codec="libx264",
            audio_codec="aac",
            fps=fps,
            preset="medium",
            threads=max(1, (os.cpu_count() or 2) // 2),
        )
    finally:
        for resource in (clip, *closeables):
            close = getattr(resource, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    pass  # closing is cleanup only; never mask the real outcome


def create_slideshow(
    image_files: List,

    narration_mode: str,                 # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
    seconds_per_image: float,
    width: int,
    height: int,
    fit_mode: str,
    bg_color: str,
    sort_mode: str,
    shuffle_seed: Optional[float],

    # single-story inputs
    story_text: str,
    match_video_to_narration: bool,

    # per-image inputs
    per_image_texts: str,                # one line per image
    per_image_multiline_blocks: str,     # blocks separated by blank lines
    per_image_audio_files: List,         # uploaded audio files
    sync_per_image_audio: bool,          # sync duration to audio for per-image modes

    # TTS config
    tts_backend: str,
    tts_voice: Optional[str],
    audio_gain_db: float
):
    """
    Render the uploaded images into an MP4 slideshow, optionally narrated.

    The narration mode decides how audio is produced:
      - "Per-image (files)": uploaded audio files, matched to images by
        basename and then by order.
      - "Per-image (TTS per line)": one TTS line per image.
      - "Per-image (TTS multiline per image)": one block of lines per image.
      - "Single story": one TTS track over the whole video. With
        ``match_video_to_narration`` the images are spread evenly over the
        narration length instead of using ``seconds_per_image``.
      - anything else, or a mode whose input is empty: silent slideshow.

    ``audio_gain_db`` is applied to the narration in every narrated mode.

    Each call works in its own temporary directory so concurrent requests
    cannot overwrite one another's output or TTS files. Those directories
    are not removed here.

    Returns:
        (video_path, status_message) on success, or (None, message) when
        the inputs cannot be used.
    """
    if not image_files:
        return None, "Please upload at least one image."

    # Normalize image paths
    paths = _uploaded_paths(image_files)
    if not paths:
        return None, "Could not read the uploaded images."

    # Order
    if sort_mode == "Filename (A→Z)":
        paths = sorted(paths, key=_basename_key)
    elif sort_mode == "Filename (Z→A)":
        paths = sorted(paths, key=_basename_key, reverse=True)
    elif sort_mode == "Shuffle":
        random.Random(int(shuffle_seed or 0)).shuffle(paths)

    # Load frames
    width = int(width)
    height = int(height)
    if width <= 0 or height <= 0:
        return None, "Width and height must be positive."
    try:
        frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
    except (OSError, ValueError) as exc:
        # Report an unreadable image as a status message, like the other input checks.
        return None, f"Could not load one of the images: {exc}"
    num_images = len(frames)

    # Text boxes may deliver None; treat that as empty text.
    story_text = story_text or ""
    per_image_texts = per_image_texts or ""
    per_image_multiline_blocks = per_image_multiline_blocks or ""

    fps = 24
    default_seconds = float(seconds_per_image)

    # Private working directory for this request (output + TTS files).
    work_dir = tempfile.mkdtemp(prefix="slideshow_")
    out_path = os.path.join(work_dir, "slideshow_output.mp4")

    # --- Per-image AUDIO FILES ---
    if narration_mode == "Per-image (files)" and per_image_audio_files:
        aud_paths = sorted(_uploaded_paths(per_image_audio_files), key=_basename_key)
        matched = _map_audio_to_images_by_name(paths, aud_paths)
        timed = [_timed_audio(ap, default_seconds, sync_per_image_audio, audio_gain_db) for ap in matched]
        per_img_audios = [clip for clip, _ in timed]
        per_img_durs = [seconds for _, seconds in timed]

        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        _render(final_clip, out_path, fps, per_img_audios)
        return out_path, "Done! Per-image audio applied."

    # --- Per-image TTS per single line ---
    if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
        lines = [ln.strip() for ln in per_image_texts.splitlines()]
        # Pad / trim to image count
        lines = (lines + [""] * max(0, num_images - len(lines)))[:num_images]

        per_img_audios = []
        per_img_durs = []
        for idx, raw in enumerate(lines):
            voice, text = _split_speaker(raw, tts_voice, tts_backend)
            apath = None
            if text:
                target = os.path.join(work_dir, f"tts_line_{idx}.wav")
                gen = synth_tts_to_file(text, tts_backend, voice, target)
                apath = gen if gen and os.path.exists(gen) else None
            aclip, seconds = _timed_audio(apath, default_seconds, sync_per_image_audio, audio_gain_db)
            per_img_audios.append(aclip)
            per_img_durs.append(seconds)

        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        _render(final_clip, out_path, fps, per_img_audios)
        return out_path, "Done! Per-image TTS (single line) applied."

    # --- Per-image TTS multiline per image ---
    if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
        blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)

        per_img_audios = []
        per_img_durs = []
        for lines in blocks:
            aclip, total = (None, 0.0)
            if lines:
                aclip, total = build_audio_for_image_lines(
                    lines=lines,
                    tts_backend=tts_backend,
                    default_voice=tts_voice,
                    audio_gain_db=audio_gain_db,
                    tmp_dir=work_dir,
                )
            per_img_audios.append(aclip)
            if aclip is not None and sync_per_image_audio:
                per_img_durs.append(float(total))
            else:
                per_img_durs.append(default_seconds)

        final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
        _render(final_clip, out_path, fps, per_img_audios)
        return out_path, "Done! Per-image multiline TTS applied."

    # --- Single story (one track) ---
    if narration_mode == "Single story" and story_text.strip():
        target = os.path.join(work_dir, "narration_single.wav")
        gen = synth_tts_to_file(story_text.strip(), tts_backend, tts_voice, target)
        audio_path = gen if gen and os.path.exists(gen) else None

        aclip = None
        narration_seconds = 0.0
        if audio_path:
            try:
                aclip = AudioFileClip(audio_path)
                narration_seconds = float(aclip.duration)
            except Exception:
                aclip = None  # narration unusable: fall back to a silent video

        matching = aclip is not None and match_video_to_narration and narration_seconds > 0
        # When matching, spread the images evenly over the narration so no
        # slide is cut off and the last one is not frozen for the remainder.
        per_image = narration_seconds / num_images if matching else default_seconds
        clip = _uniform_clip(frames, per_image, fps)

        if aclip is not None:
            try:
                if matching:
                    # Frame rounding can leave the video slightly off; pin it
                    # to the exact narration length.
                    clip = clip_with_duration(clip, narration_seconds)
                clip = clip_with_audio(clip, _apply_gain_db(aclip, audio_gain_db))
            except Exception:
                pass  # best effort: render whatever clip we have

        _render(clip, out_path, fps, [aclip])
        return out_path, "Done! Story narration applied."

    # --- No narration: uniform duration slideshow ---
    clip = _uniform_clip(frames, default_seconds, fps)
    _render(clip, out_path, fps)
    return out_path, "Done! Video created without narration."


# ---------- UI ----------

def update_voice_choices(backend_name: str):
    """
    Refresh the voice dropdown for the selected TTS backend.

    Returns a ``gr.update`` carrying the available voices (with the first
    one preselected, or no selection when there are none) together with a
    short status message.
    """
    voices = list_voices(backend_name)
    if voices:
        selected = voices[0]
        note = f"Loaded {len(voices)} voices."
    else:
        selected = None
        note = "No voices found (or using gTTS)."
    return gr.update(choices=voices, value=selected), note


def ui():
    """
    Build and return the Gradio Blocks app.

    Layout: a two-column row (image/sizing options on the left, narration
    and TTS options on the right) followed by the result video. Changing
    the TTS backend, and the initial page load, both refresh the voice
    dropdown through ``update_voice_choices``; the "Create Video" button
    calls ``create_slideshow``.
    """
    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🖼️ → 🎬 Slideshow Maker
            - **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
            - **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
            - **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
            """
        )

        with gr.Row():
            # Left column: images, ordering, timing and sizing.
            with gr.Column(scale=1):
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")

                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
                    label="Seconds per Image (used when not syncing to audio)"
                )

                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")

                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")

            # Right column: narration mode, its inputs, and TTS settings.
            with gr.Column(scale=1):
                # These option strings are the values create_slideshow compares against.
                narration_mode = gr.Radio(
                    ["None",
                     "Single story",
                     "Per-image (files)",
                     "Per-image (TTS per line)",
                     "Per-image (TTS multiline per image)"],
                    value="None",
                    label="Narration mode"
                )

                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=20,
                    
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )

                # Per-image UI (files)
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                sync_per_image_audio = gr.Checkbox(
                    value=True, label="Sync image to audio duration (per-image modes)"
                )

                # Per-image UI (text)
                per_image_texts = gr.Textbox(
                    label="Per-image TTS (one line per image)",
                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
                    lines=8,
                   
                )
                per_image_multiline_blocks = gr.Textbox(
                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
                    lines=40,
                    
                )

                with gr.Row():
                    # The backend names must match the strings checked by the TTS helpers.
                    tts_backend = gr.Dropdown(
                        ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                        value="Coqui (VCTK multi-speaker)",
                        label="TTS backend"
                    )
                    # Starts empty; filled by update_voice_choices via the events below.
                    tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
                voice_status = gr.Markdown("")

                audio_gain_db = gr.Slider(
                    minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
                )

                run_btn = gr.Button("Create Video", variant="primary")
                status = gr.Markdown("")

        video_out = gr.Video(label="Result", autoplay=False)

        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )

        # Also populate on initial load
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )

        # Main action
        # The inputs are passed positionally, so their order here has to
        # follow the parameter order of create_slideshow.
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files,
                narration_mode,
                seconds_per_image,
                width, height,
                fit_mode, bg_color,
                sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image text inputs
                per_image_texts, per_image_multiline_blocks,
                # per-image files + sync
                per_image_audio_files, sync_per_image_audio,
                # tts
                tts_backend, tts_voice,
                audio_gain_db
            ],
            outputs=[video_out, status],
        )

        gr.Markdown(
            """
            **Tips**
            - *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
            - *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
            - *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
            """
        )

    return demo


if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server when run as a script.
    app = ui()
    app.launch()