# VideoCreator / app.py — Hugging Face Space (page-scrape header removed; commit 32e7cea)
# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
import os
import re
import tempfile
import random
from typing import Optional, List, Dict
import numpy as np
from PIL import Image
import gradio as gr
# ---- MoviePy imports with v2/v1 compatibility ----
MPY_V2 = False
afx = None # audio effects (v2)
_CompositeAudioClip = None
_concat_audios = None
try:
# v2.x preferred
from moviepy import (
ImageSequenceClip,
AudioFileClip,
ImageClip,
concatenate_videoclips,
)
try:
from moviepy import afx as _afx # type: ignore
afx = _afx
except Exception:
afx = None
try:
from moviepy import CompositeAudioClip as _CompositeAudioClip # type: ignore
except Exception:
_CompositeAudioClip = None
try:
from moviepy import concatenate_audioclips as _concat_audios # type: ignore
except Exception:
_concat_audios = None
MPY_V2 = True
except Exception:
# v1.x fallback
from moviepy.editor import (
ImageSequenceClip,
AudioFileClip,
ImageClip,
concatenate_videoclips,
CompositeAudioClip as _CompositeAudioClip, # type: ignore
concatenate_audioclips as _concat_audios, # type: ignore
)
MPY_V2 = False
# ---------- Small compatibility helpers ----------
def clip_with_duration(clip, duration: float):
    """Assign a duration to `clip`, bridging the MoviePy v1/v2 API split."""
    setter = getattr(clip, "with_duration", None)  # MoviePy v2 name
    if setter is None:
        setter = clip.set_duration  # MoviePy v1 name
    return setter(duration)
def clip_with_audio(clip, audio):
    """Attach an audio track to `clip`, bridging the MoviePy v1/v2 API split."""
    attach = getattr(clip, "with_audio", None)  # MoviePy v2 name
    if attach is None:
        attach = clip.set_audio  # MoviePy v1 name
    return attach(audio)
def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Try to apply a linear gain to an AudioFileClip.
    If effects aren't available, return the original clip (no-op).
    """
    # MoviePy v2 path: with_effects + afx.MultiplyVolume. Module-global `afx`
    # may be None when the effects import failed, hence the extra check.
    if hasattr(audio_clip, "with_effects") and afx is not None:
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            pass
    # MoviePy v1 path: clip.fx(afx.volumex, gain).
    if hasattr(audio_clip, "fx"):
        try:
            if afx is not None and hasattr(afx, "volumex"):
                return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            pass
    # Neither effects API is usable: deliberately a no-op rather than
    # failing the whole render over a volume adjustment.
    return audio_clip
def concat_audios_or_composite(clips: List):
    """
    Join several audio clips end to end.

    Prefers the library's concatenator when available; otherwise emulates
    concatenation with a CompositeAudioClip whose members get sequential
    start offsets. Returns None for an empty list, the lone clip for a
    singleton, and the first clip as an absolute last resort.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            pass

    if _CompositeAudioClip is not None:
        # Emulate concatenation: shift each clip to start where the previous ended.
        offset = 0.0
        staged = []
        for clip in clips:
            try:
                staged.append(clip.set_start(offset))
                offset += float(clip.duration)
            except Exception:
                pass
        composite = _CompositeAudioClip(staged)
        try:
            composite = clip_with_duration(composite, offset)
        except Exception:
            pass
        return composite

    # Last resort when neither concatenation mechanism exists.
    return clips[0]
# ---------- Image utilities ----------
def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Read an image file and return an RGB numpy array shaped exactly (height, width, 3).

    fit:
      - "stretch": distort the image to the target size.
      - "cover":   keep aspect ratio, scale until the target is fully covered,
                   then center-crop the overflow.
      - "contain": keep aspect ratio, scale to fit inside the target, and
                   center on a solid `bg`-colored canvas (letterbox/pillarbox).
    """
    img = Image.open(path).convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    src_w, src_h = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(src_w) / float(src_h)
    src_is_wider = src_aspect > target_aspect

    if fit == "cover":
        # Scale so the image fully covers the target, then crop the excess.
        if src_is_wider:
            scaled_h = height
            scaled_w = int(round(src_aspect * scaled_h))
        else:
            scaled_w = width
            scaled_h = int(round(scaled_w / src_aspect))
        img = img.resize((scaled_w, scaled_h), Image.LANCZOS)
        x0 = (scaled_w - width) // 2
        y0 = (scaled_h - height) // 2
        return np.array(img.crop((x0, y0, x0 + width, y0 + height)))

    # "contain": shrink to fit, then center on a background canvas.
    if src_is_wider:
        scaled_w = width
        scaled_h = int(round(scaled_w / src_aspect))
    else:
        scaled_h = height
        scaled_w = int(round(src_aspect * scaled_h))
    canvas = Image.new("RGB", (width, height), bg)
    canvas.paste(img.resize((scaled_w, scaled_h), Image.LANCZOS),
                 ((width - scaled_w) // 2, (height - scaled_h) // 2))
    return np.array(canvas)
# ---------- TTS backends ----------
_TTS_CACHE: Dict[str, object] = {}
def _get_tts_backend(backend_name: str):
"""
Lazy-load a TTS backend instance.
- "Coqui (VCTK multi-speaker)" -> coqui-ai/TTS model: tts_models/en/vctk/vits
- "gTTS (simple)" -> sentinel string "gTTS"
"""
if backend_name == "Coqui (VCTK multi-speaker)":
if backend_name not in _TTS_CACHE:
from TTS.api import TTS # heavy import
_TTS_CACHE[backend_name] = TTS("tts_models/en/vctk/vits")
return _TTS_CACHE[backend_name]
elif backend_name == "gTTS (simple)":
return "gTTS"
return None
def list_voices(backend_name: str) -> List[str]:
    """
    Return the selectable speaker IDs for a TTS backend.

    Only the Coqui VCTK backend exposes voices; any other backend yields [].
    Known-male VCTK IDs are floated to the front, duplicates are removed,
    and a hard-coded VCTK list is used whenever the model can't be inspected.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []
    try:
        tts = _get_tts_backend(backend_name)
        found: List[str] = []
        # Different TTS releases hang the speaker list off different attributes.
        for attr_path in (
            "speakers",
            "speaker_manager.speaker_names",
            "speaker_manager.speaker_ids",
        ):
            try:
                node = tts
                for attr in attr_path.split("."):
                    node = getattr(node, attr)
                values = list(node) if node is not None else []
            except Exception:
                continue
            if values:
                found = [str(v) for v in values]
                break
        if not found:
            # Safety net: well-known VCTK speaker IDs.
            found = [
                "p225","p226","p233","p243","p254","p256","p258","p259",
                "p270","p273","p274","p278","p279","p302","p311","p316",
                "p334","p345","p360","p363","p374"
            ]
        # Move commonly-male IDs to the front, preserving their preference order.
        male_first = ["p225","p226","p233","p243","p270","p274","p279","p311","p345","p360","p363"]
        ranked = found[:]
        for vid in reversed(male_first):
            if vid in ranked:
                ranked.remove(vid)
                ranked.insert(0, vid)
        # De-duplicate while keeping order.
        unique: List[str] = []
        seen = set()
        for vid in ranked:
            if vid not in seen:
                seen.add(vid)
                unique.append(vid)
        return unique
    except Exception:
        # Absolute fallback when even backend loading blows up.
        return ["p225","p226","p233","p243"]
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Synthesize `text` to an audio file and return the file's path.

    Coqui output is forced to .wav and gTTS output to .mp3 (the extension of
    `out_path` is rewritten when it doesn't match). Returns None for blank
    text, unknown backends, or any synthesis failure.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return None

    if backend_name == "Coqui (VCTK multi-speaker)":
        try:
            engine = _get_tts_backend(backend_name)
            if not out_path.lower().endswith(".wav"):
                out_path = os.path.splitext(out_path)[0] + ".wav"
            engine.tts_to_file(text=cleaned, speaker=voice, file_path=out_path)
            return out_path
        except Exception:
            return None

    if backend_name == "gTTS (simple)":
        try:
            from gtts import gTTS
            if not out_path.lower().endswith(".mp3"):
                out_path = os.path.splitext(out_path)[0] + ".mp3"
            gTTS(text=cleaned, lang="en").save(out_path)
            return out_path
        except Exception:
            return None

    return None
# ---------- Text parsing for multiline-per-image ----------
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Split narration text into one block of lines per image.

    Blocks are separated by blank lines; within a block every non-empty line
    becomes its own TTS segment. The result is padded with empty blocks or
    truncated so it always has exactly `expected_images` entries.
    """
    if not (text or "").strip():
        return [[] for _ in range(expected_images)]

    raw_blocks = [b.strip() for b in re.split(r"\n\s*\n", text.strip()) if b.strip()]

    # Force the block count to match the image count.
    shortfall = expected_images - len(raw_blocks)
    if shortfall > 0:
        raw_blocks.extend([""] * shortfall)
    else:
        raw_blocks = raw_blocks[:expected_images]

    return [
        [ln.strip() for ln in block.splitlines() if ln.strip()]
        for block in raw_blocks
    ]
# ---------- Build audio for each image from multiple lines ----------
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build one narration track for a single image from several text lines.

    Each line is synthesized separately (a "speaker| text" prefix overrides
    the default Coqui voice for that line), the segments are concatenated,
    and the requested dB gain is applied to the combined track.

    Returns (audio_clip, duration_seconds), or (None, 0.0) when nothing
    could be synthesized.
    """
    segments = []
    for seg_idx, raw_line in enumerate(lines):
        line_voice = default_voice
        line_text = raw_line
        if "|" in raw_line and tts_backend.startswith("Coqui"):
            speaker_part, text_part = raw_line.split("|", 1)
            if text_part.strip():
                line_text = text_part.strip()
            if speaker_part.strip():
                line_voice = speaker_part.strip()
        # Random component avoids clashes between repeated renders in the same tmp dir.
        seg_path = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{seg_idx}.wav")
        produced = synth_tts_to_file(line_text, tts_backend, line_voice, seg_path)
        if produced and os.path.exists(produced):
            try:
                segments.append(AudioFileClip(produced))
            except Exception:
                pass

    if not segments:
        return None, 0.0

    track = concat_audios_or_composite(segments)
    if track is None:
        return None, 0.0

    # Convert dB to a linear factor; skip the effect at ~unity gain.
    linear = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(linear - 1.0) > 1e-3:
        track = apply_linear_gain(track, linear)

    return track, float(track.duration)
# ---------- Variable-duration video (per-image) ----------
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip / CompositeAudioClip / None
):
    """
    Assemble a video in which every image carries its own display duration
    and, optionally, its own audio track.

    Durations are clamped to a 0.05 s minimum; an image whose audio cannot
    be attached is still included, just silent.
    """
    image_clips = []
    for frame, duration, audio in zip(frames, per_image_durations, per_image_audios):
        segment = clip_with_duration(ImageClip(frame), float(max(0.05, duration)))
        if audio is not None:
            try:
                segment = clip_with_audio(segment, audio)
            except Exception:
                pass
        image_clips.append(segment)
    return concatenate_videoclips(image_clips, method="compose")
# ---------- Main create function ----------
def create_slideshow(
image_files: List,
narration_mode: str, # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
seconds_per_image: float,
width: int,
height: int,
fit_mode: str,
bg_color: str,
sort_mode: str,
shuffle_seed: Optional[float],
# single-story inputs
story_text: str,
match_video_to_narration: bool,
# per-image inputs
per_image_texts: str, # one line per image
per_image_multiline_blocks: str, # blocks separated by blank lines
per_image_audio_files: List, # uploaded audio files
sync_per_image_audio: bool, # sync duration to audio for per-image modes
# TTS config
tts_backend: str,
tts_voice: Optional[str],
audio_gain_db: float
):
if not image_files:
return None, "Please upload at least one image."
# Normalize image paths
paths = []
for f in image_files:
p = getattr(f, "name", None) or getattr(f, "path", None) or f
if p and os.path.exists(p):
paths.append(p)
if not paths:
return None, "Could not read the uploaded images."
# Order
if sort_mode == "Filename (A→Z)":
paths = sorted(paths, key=lambda p: os.path.basename(p).lower())
elif sort_mode == "Filename (Z→A)":
paths = sorted(paths, key=lambda p: os.path.basename(p).lower(), reverse=True)
elif sort_mode == "Shuffle":
rnd = random.Random(int(shuffle_seed or 0))
rnd.shuffle(paths)
# Load frames
width = int(width); height = int(height)
frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
num_images = len(frames)
out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
# --- Per-image AUDIO FILES ---
if narration_mode == "Per-image (files)" and per_image_audio_files:
# Normalize audio paths & sort by filename
aud_paths = []
for a in per_image_audio_files:
ap = getattr(a, "name", None) or getattr(a, "path", None) or a
if ap and os.path.exists(ap):
aud_paths.append(ap)
aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
# Basename match, then index fallback
def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
result = [None] * len(image_paths)
if not audio_paths:
return result
audio_map = {}
for a in audio_paths:
base = os.path.splitext(os.path.basename(a))[0].lower()
audio_map[base] = a
used = set()
for i, ip in enumerate(image_paths):
base = os.path.splitext(os.path.basename(ip))[0].lower()
if base in audio_map:
result[i] = audio_map[base]; used.add(audio_map[base])
leftover = [a for a in audio_paths if a not in used]
for i in range(len(image_paths)):
if result[i] is None and leftover:
result[i] = leftover.pop(0)
return result
per_img_audio_paths = map_audio_to_images_by_name(paths, aud_paths)
per_img_audios = []
per_img_durs = []
for ap in per_img_audio_paths:
if ap:
try:
aclip = AudioFileClip(ap)
per_img_audios.append(aclip)
per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
except Exception:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image audio applied."
# --- Per-image TTS per single line ---
if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
lines = [ln.strip() for ln in per_image_texts.splitlines()]
# Pad / trim to image count
if len(lines) < num_images:
lines += [""] * (num_images - len(lines))
else:
lines = lines[:num_images]
tmp_dir = tempfile.gettempdir()
per_img_audios = []
per_img_durs = []
for idx, text in enumerate(lines):
voice = tts_voice
if "|" in text and tts_backend.startswith("Coqui"):
maybe_speaker, maybe_text = text.split("|", 1)
if maybe_text.strip():
text = maybe_text.strip()
if maybe_speaker.strip():
voice = maybe_speaker.strip()
apath = None
if text:
apath = os.path.join(tmp_dir, f"tts_line_{idx}.wav")
gen = synth_tts_to_file(text, tts_backend, voice, apath)
apath = gen if gen and os.path.exists(gen) else None
if apath:
try:
aclip = AudioFileClip(apath)
per_img_audios.append(aclip)
per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
except Exception:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image TTS (single line) applied."
# --- Per-image TTS multiline per image ---
if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
tmp_dir = tempfile.gettempdir()
per_img_audios = []
per_img_durs = []
for idx, lines in enumerate(blocks):
if not lines:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
continue
aclip, total = build_audio_for_image_lines(
lines=lines,
tts_backend=tts_backend,
default_voice=tts_voice,
audio_gain_db=audio_gain_db,
tmp_dir=tmp_dir
)
if aclip is not None:
per_img_audios.append(aclip)
per_img_durs.append(float(total) if sync_per_image_audio else float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image multiline TTS applied."
# --- Single story (one track) ---
if narration_mode == "Single story" and story_text.strip():
# Base video (uniform duration)
fps = 24
repeats = max(1, int(round(float(seconds_per_image) * fps)))
expanded = []
for frame in frames:
expanded.extend([frame] * repeats)
clip = ImageSequenceClip(expanded, fps=fps)
# TTS
tmp = tempfile.gettempdir()
audio_path = os.path.join(tmp, "narration_single.wav")
gen = synth_tts_to_file(story_text.strip(), tts_backend, tts_voice, audio_path)
audio_path = gen if gen and os.path.exists(gen) else None
if audio_path:
try:
aclip = AudioFileClip(audio_path)
if match_video_to_narration:
clip = clip_with_duration(clip, float(aclip.duration))
gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
if abs(gain - 1.0) > 1e-3:
aclip = apply_linear_gain(aclip, gain)
clip = clip_with_audio(clip, aclip)
except Exception:
pass
clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=fps,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Story narration applied."
# --- No narration: uniform duration slideshow ---
fps = 24
repeats = max(1, int(round(float(seconds_per_image) * fps)))
expanded = []
for frame in frames:
expanded.extend([frame] * repeats)
clip = ImageSequenceClip(expanded, fps=fps)
clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=fps,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Video created without narration."
# ---------- UI ----------
def update_voice_choices(backend_name: str):
    """Refresh the voice dropdown for the chosen TTS backend and report a status line."""
    voices = list_voices(backend_name)
    default = voices[0] if voices else None
    status = f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
    return gr.update(choices=voices, value=default), status
def ui():
    """
    Build the Gradio Blocks UI and return the (unlaunched) demo object.

    Left column: image uploads, ordering, timing, and output geometry.
    Right column: narration mode and the inputs used by each mode (only the
    inputs relevant to the chosen mode are used at render time).
    Below: TTS backend/voice picker, gain slider, run button, and outputs.

    NOTE(review): the pasted source lost its indentation, so the exact widget
    nesting (which controls sit inside which column) is reconstructed here —
    confirm against the deployed Space layout.
    """
    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
        # Feature overview shown at the top of the page.
        gr.Markdown(
            """
# 🖼️ → 🎬 Slideshow Maker
- **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
- **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
- **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
"""
        )
        with gr.Row():
            with gr.Column(scale=1):
                # --- Left column: images + ordering + output geometry ---
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                # Seed only matters when "Shuffle" ordering is selected.
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
                    label="Seconds per Image (used when not syncing to audio)"
                )
                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")
                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")
            with gr.Column(scale=1):
                # --- Right column: narration mode + per-mode inputs ---
                narration_mode = gr.Radio(
                    ["None",
                     "Single story",
                     "Per-image (files)",
                     "Per-image (TTS per line)",
                     "Per-image (TTS multiline per image)"],
                    value="None",
                    label="Narration mode"
                )
                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=20,
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )
                # Per-image UI (files)
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                sync_per_image_audio = gr.Checkbox(
                    value=True, label="Sync image to audio duration (per-image modes)"
                )
                # Per-image UI (text)
                per_image_texts = gr.Textbox(
                    label="Per-image TTS (one line per image)",
                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
                    lines=8,
                )
                per_image_multiline_blocks = gr.Textbox(
                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
                    lines=40,
                )
        # TTS backend + voice picker (voice list is populated dynamically below).
        with gr.Row():
            tts_backend = gr.Dropdown(
                ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                value="Coqui (VCTK multi-speaker)",
                label="TTS backend"
            )
            tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
            voice_status = gr.Markdown("")
        audio_gain_db = gr.Slider(
            minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
        )
        run_btn = gr.Button("Create Video", variant="primary")
        status = gr.Markdown("")
        video_out = gr.Video(label="Result", autoplay=False)
        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Also populate on initial load
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Main action: argument order must match create_slideshow's signature.
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files,
                narration_mode,
                seconds_per_image,
                width, height,
                fit_mode, bg_color,
                sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image text inputs
                per_image_texts, per_image_multiline_blocks,
                # per-image files + sync
                per_image_audio_files, sync_per_image_audio,
                # tts
                tts_backend, tts_voice,
                audio_gain_db
            ],
            outputs=[video_out, status],
        )
        # Usage tips shown under the outputs.
        gr.Markdown(
            """
**Tips**
- *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
- *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
- *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
"""
        )
    return demo
# Entry point: build the UI and start the Gradio server when run as a script.
if __name__ == "__main__":
    ui().launch()