# VideoCreator / app.py — Hugging Face Space (page-scrape header removed; commit 32e7cea)
# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
import os
import re
import tempfile
import random
from typing import Optional, List, Dict
import numpy as np
from PIL import Image
import gradio as gr
# ---- MoviePy imports with v2/v1 compatibility ----
MPY_V2 = False
afx = None # audio effects (v2)
_CompositeAudioClip = None
_concat_audios = None
try:
# v2.x preferred
from moviepy import (
ImageSequenceClip,
AudioFileClip,
ImageClip,
concatenate_videoclips,
)
try:
from moviepy import afx as _afx # type: ignore
afx = _afx
except Exception:
afx = None
try:
from moviepy import CompositeAudioClip as _CompositeAudioClip # type: ignore
except Exception:
_CompositeAudioClip = None
try:
from moviepy import concatenate_audioclips as _concat_audios # type: ignore
except Exception:
_concat_audios = None
MPY_V2 = True
except Exception:
# v1.x fallback
from moviepy.editor import (
ImageSequenceClip,
AudioFileClip,
ImageClip,
concatenate_videoclips,
CompositeAudioClip as _CompositeAudioClip, # type: ignore
concatenate_audioclips as _concat_audios, # type: ignore
)
MPY_V2 = False
# ---------- Small compatibility helpers ----------
def clip_with_duration(clip, duration: float):
    """Assign a duration to `clip`, bridging the MoviePy v1/v2 API split."""
    setter = getattr(clip, "with_duration", None)  # MoviePy v2 name
    if setter is None:
        setter = clip.set_duration  # MoviePy v1 name
    return setter(duration)
def clip_with_audio(clip, audio):
    """Attach an audio track to `clip`, bridging the MoviePy v1/v2 API split."""
    attach = getattr(clip, "with_audio", None)  # MoviePy v2 name
    if attach is None:
        attach = clip.set_audio  # MoviePy v1 name
    return attach(audio)
def apply_linear_gain(audio_clip, gain_linear: float):
    """
    Try to apply a linear gain to an AudioFileClip.
    If effects aren't available, return the original clip (no-op).
    """
    # MoviePy v2 path: with_effects + afx.MultiplyVolume. Module-global `afx`
    # may be None when the effects import failed, hence the extra check.
    if hasattr(audio_clip, "with_effects") and afx is not None:
        try:
            return audio_clip.with_effects([afx.MultiplyVolume(gain_linear)])
        except Exception:
            pass
    # MoviePy v1 path: clip.fx(afx.volumex, gain).
    if hasattr(audio_clip, "fx"):
        try:
            if afx is not None and hasattr(afx, "volumex"):
                return audio_clip.fx(afx.volumex, gain_linear)
        except Exception:
            pass
    # Neither effects API is usable: deliberately a no-op rather than
    # failing the whole render over a volume adjustment.
    return audio_clip
def concat_audios_or_composite(clips: List):
    """
    Join several audio clips end to end.

    Prefers the library's concatenator when available; otherwise emulates
    concatenation with a CompositeAudioClip whose members get sequential
    start offsets. Returns None for an empty list, the lone clip for a
    singleton, and the first clip as an absolute last resort.
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            pass

    if _CompositeAudioClip is not None:
        # Emulate concatenation: shift each clip to start where the previous ended.
        offset = 0.0
        staged = []
        for clip in clips:
            try:
                staged.append(clip.set_start(offset))
                offset += float(clip.duration)
            except Exception:
                pass
        composite = _CompositeAudioClip(staged)
        try:
            composite = clip_with_duration(composite, offset)
        except Exception:
            pass
        return composite

    # Last resort when neither concatenation mechanism exists.
    return clips[0]
# ---------- Image utilities ----------
def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
    """
    Read an image file and return an RGB numpy array shaped exactly (height, width, 3).

    fit:
      - "stretch": distort the image to the target size.
      - "cover":   keep aspect ratio, scale until the target is fully covered,
                   then center-crop the overflow.
      - "contain": keep aspect ratio, scale to fit inside the target, and
                   center on a solid `bg`-colored canvas (letterbox/pillarbox).
    """
    img = Image.open(path).convert("RGB")

    if fit == "stretch":
        return np.array(img.resize((width, height), Image.LANCZOS))

    src_w, src_h = img.size
    target_aspect = float(width) / float(height)
    src_aspect = float(src_w) / float(src_h)
    src_is_wider = src_aspect > target_aspect

    if fit == "cover":
        # Scale so the image fully covers the target, then crop the excess.
        if src_is_wider:
            scaled_h = height
            scaled_w = int(round(src_aspect * scaled_h))
        else:
            scaled_w = width
            scaled_h = int(round(scaled_w / src_aspect))
        img = img.resize((scaled_w, scaled_h), Image.LANCZOS)
        x0 = (scaled_w - width) // 2
        y0 = (scaled_h - height) // 2
        return np.array(img.crop((x0, y0, x0 + width, y0 + height)))

    # "contain": shrink to fit, then center on a background canvas.
    if src_is_wider:
        scaled_w = width
        scaled_h = int(round(scaled_w / src_aspect))
    else:
        scaled_h = height
        scaled_w = int(round(src_aspect * scaled_h))
    canvas = Image.new("RGB", (width, height), bg)
    canvas.paste(img.resize((scaled_w, scaled_h), Image.LANCZOS),
                 ((width - scaled_w) // 2, (height - scaled_h) // 2))
    return np.array(canvas)
# ---------- TTS backends ----------
_TTS_CACHE: Dict[str, object] = {}
def _get_tts_backend(backend_name: str):
"""
Lazy-load a TTS backend instance.
- "Coqui (VCTK multi-speaker)" -> coqui-ai/TTS model: tts_models/en/vctk/vits
- "gTTS (simple)" -> sentinel string "gTTS"
"""
if backend_name == "Coqui (VCTK multi-speaker)":
if backend_name not in _TTS_CACHE:
from TTS.api import TTS # heavy import
_TTS_CACHE[backend_name] = TTS("tts_models/en/vctk/vits")
return _TTS_CACHE[backend_name]
elif backend_name == "gTTS (simple)":
return "gTTS"
return None
def list_voices(backend_name: str) -> List[str]:
    """
    Return the selectable speaker IDs for a TTS backend.

    Only the Coqui VCTK backend exposes voices; any other backend yields [].
    Known-male VCTK IDs are floated to the front, duplicates are removed,
    and a hard-coded VCTK list is used whenever the model can't be inspected.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []
    try:
        tts = _get_tts_backend(backend_name)
        found: List[str] = []
        # Different TTS releases hang the speaker list off different attributes.
        for attr_path in (
            "speakers",
            "speaker_manager.speaker_names",
            "speaker_manager.speaker_ids",
        ):
            try:
                node = tts
                for attr in attr_path.split("."):
                    node = getattr(node, attr)
                values = list(node) if node is not None else []
            except Exception:
                continue
            if values:
                found = [str(v) for v in values]
                break
        if not found:
            # Safety net: well-known VCTK speaker IDs.
            found = [
                "p225","p226","p233","p243","p254","p256","p258","p259",
                "p270","p273","p274","p278","p279","p302","p311","p316",
                "p334","p345","p360","p363","p374"
            ]
        # Move commonly-male IDs to the front, preserving their preference order.
        male_first = ["p225","p226","p233","p243","p270","p274","p279","p311","p345","p360","p363"]
        ranked = found[:]
        for vid in reversed(male_first):
            if vid in ranked:
                ranked.remove(vid)
                ranked.insert(0, vid)
        # De-duplicate while keeping order.
        unique: List[str] = []
        seen = set()
        for vid in ranked:
            if vid not in seen:
                seen.add(vid)
                unique.append(vid)
        return unique
    except Exception:
        # Absolute fallback when even backend loading blows up.
        return ["p225","p226","p233","p243"]
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
    """
    Synthesize `text` to an audio file and return the file's path.

    Coqui output is forced to .wav and gTTS output to .mp3 (the extension of
    `out_path` is rewritten when it doesn't match). Returns None for blank
    text, unknown backends, or any synthesis failure.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return None

    if backend_name == "Coqui (VCTK multi-speaker)":
        try:
            engine = _get_tts_backend(backend_name)
            if not out_path.lower().endswith(".wav"):
                out_path = os.path.splitext(out_path)[0] + ".wav"
            engine.tts_to_file(text=cleaned, speaker=voice, file_path=out_path)
            return out_path
        except Exception:
            return None

    if backend_name == "gTTS (simple)":
        try:
            from gtts import gTTS
            if not out_path.lower().endswith(".mp3"):
                out_path = os.path.splitext(out_path)[0] + ".mp3"
            gTTS(text=cleaned, lang="en").save(out_path)
            return out_path
        except Exception:
            return None

    return None
# ---------- Text parsing for multiline-per-image ----------
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Split narration text into one block of lines per image.

    Blocks are separated by blank lines; within a block every non-empty line
    becomes its own TTS segment. The result is padded with empty blocks or
    truncated so it always has exactly `expected_images` entries.
    """
    if not (text or "").strip():
        return [[] for _ in range(expected_images)]

    raw_blocks = [b.strip() for b in re.split(r"\n\s*\n", text.strip()) if b.strip()]

    # Force the block count to match the image count.
    shortfall = expected_images - len(raw_blocks)
    if shortfall > 0:
        raw_blocks.extend([""] * shortfall)
    else:
        raw_blocks = raw_blocks[:expected_images]

    return [
        [ln.strip() for ln in block.splitlines() if ln.strip()]
        for block in raw_blocks
    ]
# ---------- Build audio for each image from multiple lines ----------
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build one narration track for a single image from several text lines.

    Each line is synthesized separately (a "speaker| text" prefix overrides
    the default Coqui voice for that line), the segments are concatenated,
    and the requested dB gain is applied to the combined track.

    Returns (audio_clip, duration_seconds), or (None, 0.0) when nothing
    could be synthesized.
    """
    segments = []
    for seg_idx, raw_line in enumerate(lines):
        line_voice = default_voice
        line_text = raw_line
        if "|" in raw_line and tts_backend.startswith("Coqui"):
            speaker_part, text_part = raw_line.split("|", 1)
            if text_part.strip():
                line_text = text_part.strip()
            if speaker_part.strip():
                line_voice = speaker_part.strip()
        # Random component avoids clashes between repeated renders in the same tmp dir.
        seg_path = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{seg_idx}.wav")
        produced = synth_tts_to_file(line_text, tts_backend, line_voice, seg_path)
        if produced and os.path.exists(produced):
            try:
                segments.append(AudioFileClip(produced))
            except Exception:
                pass

    if not segments:
        return None, 0.0

    track = concat_audios_or_composite(segments)
    if track is None:
        return None, 0.0

    # Convert dB to a linear factor; skip the effect at ~unity gain.
    linear = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(linear - 1.0) > 1e-3:
        track = apply_linear_gain(track, linear)

    return track, float(track.duration)
# ---------- Variable-duration video (per-image) ----------
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip / CompositeAudioClip / None
):
    """
    Assemble a video in which every image carries its own display duration
    and, optionally, its own audio track.

    Durations are clamped to a 0.05 s minimum; an image whose audio cannot
    be attached is still included, just silent.
    """
    image_clips = []
    for frame, duration, audio in zip(frames, per_image_durations, per_image_audios):
        segment = clip_with_duration(ImageClip(frame), float(max(0.05, duration)))
        if audio is not None:
            try:
                segment = clip_with_audio(segment, audio)
            except Exception:
                pass
        image_clips.append(segment)
    return concatenate_videoclips(image_clips, method="compose")
# ---------- Main create function ----------
def create_slideshow(
image_files: List,
narration_mode: str, # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
seconds_per_image: float,
width: int,
height: int,
fit_mode: str,
bg_color: str,
sort_mode: str,
shuffle_seed: Optional[float],
# single-story inputs
story_text: str,
match_video_to_narration: bool,
# per-image inputs
per_image_texts: str, # one line per image
per_image_multiline_blocks: str, # blocks separated by blank lines
per_image_audio_files: List, # uploaded audio files
sync_per_image_audio: bool, # sync duration to audio for per-image modes
# TTS config
tts_backend: str,
tts_voice: Optional[str],
audio_gain_db: float
):
if not image_files:
return None, "Please upload at least one image."
# Normalize image paths
paths = []
for f in image_files:
p = getattr(f, "name", None) or getattr(f, "path", None) or f
if p and os.path.exists(p):
paths.append(p)
if not paths:
return None, "Could not read the uploaded images."
# Order
if sort_mode == "Filename (A→Z)":
paths = sorted(paths, key=lambda p: os.path.basename(p).lower())
elif sort_mode == "Filename (Z→A)":
paths = sorted(paths, key=lambda p: os.path.basename(p).lower(), reverse=True)
elif sort_mode == "Shuffle":
rnd = random.Random(int(shuffle_seed or 0))
rnd.shuffle(paths)
# Load frames
width = int(width); height = int(height)
frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
num_images = len(frames)
out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
# --- Per-image AUDIO FILES ---
if narration_mode == "Per-image (files)" and per_image_audio_files:
# Normalize audio paths & sort by filename
aud_paths = []
for a in per_image_audio_files:
ap = getattr(a, "name", None) or getattr(a, "path", None) or a
if ap and os.path.exists(ap):
aud_paths.append(ap)
aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
# Basename match, then index fallback
def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
result = [None] * len(image_paths)
if not audio_paths:
return result
audio_map = {}
for a in audio_paths:
base = os.path.splitext(os.path.basename(a))[0].lower()
audio_map[base] = a
used = set()
for i, ip in enumerate(image_paths):
base = os.path.splitext(os.path.basename(ip))[0].lower()
if base in audio_map:
result[i] = audio_map[base]; used.add(audio_map[base])
leftover = [a for a in audio_paths if a not in used]
for i in range(len(image_paths)):
if result[i] is None and leftover:
result[i] = leftover.pop(0)
return result
per_img_audio_paths = map_audio_to_images_by_name(paths, aud_paths)
per_img_audios = []
per_img_durs = []
for ap in per_img_audio_paths:
if ap:
try:
aclip = AudioFileClip(ap)
per_img_audios.append(aclip)
per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
except Exception:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image audio applied."
# --- Per-image TTS per single line ---
if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
lines = [ln.strip() for ln in per_image_texts.splitlines()]
# Pad / trim to image count
if len(lines) < num_images:
lines += [""] * (num_images - len(lines))
else:
lines = lines[:num_images]
tmp_dir = tempfile.gettempdir()
per_img_audios = []
per_img_durs = []
for idx, text in enumerate(lines):
voice = tts_voice
if "|" in text and tts_backend.startswith("Coqui"):
maybe_speaker, maybe_text = text.split("|", 1)
if maybe_text.strip():
text = maybe_text.strip()
if maybe_speaker.strip():
voice = maybe_speaker.strip()
apath = None
if text:
apath = os.path.join(tmp_dir, f"tts_line_{idx}.wav")
gen = synth_tts_to_file(text, tts_backend, voice, apath)
apath = gen if gen and os.path.exists(gen) else None
if apath:
try:
aclip = AudioFileClip(apath)
per_img_audios.append(aclip)
per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
except Exception:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image TTS (single line) applied."
# --- Per-image TTS multiline per image ---
if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
tmp_dir = tempfile.gettempdir()
per_img_audios = []
per_img_durs = []
for idx, lines in enumerate(blocks):
if not lines:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
continue
aclip, total = build_audio_for_image_lines(
lines=lines,
tts_backend=tts_backend,
default_voice=tts_voice,
audio_gain_db=audio_gain_db,
tmp_dir=tmp_dir
)
if aclip is not None:
per_img_audios.append(aclip)
per_img_durs.append(float(total) if sync_per_image_audio else float(seconds_per_image))
else:
per_img_audios.append(None)
per_img_durs.append(float(seconds_per_image))
final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
final_clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=24,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Per-image multiline TTS applied."
# --- Single story (one track) ---
if narration_mode == "Single story" and story_text.strip():
# Base video (uniform duration)
fps = 24
repeats = max(1, int(round(float(seconds_per_image) * fps)))
expanded = []
for frame in frames:
expanded.extend([frame] * repeats)
clip = ImageSequenceClip(expanded, fps=fps)
# TTS
tmp = tempfile.gettempdir()
audio_path = os.path.join(tmp, "narration_single.wav")
gen = synth_tts_to_file(story_text.strip(), tts_backend, tts_voice, audio_path)
audio_path = gen if gen and os.path.exists(gen) else None
if audio_path:
try:
aclip = AudioFileClip(audio_path)
if match_video_to_narration:
clip = clip_with_duration(clip, float(aclip.duration))
gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
if abs(gain - 1.0) > 1e-3:
aclip = apply_linear_gain(aclip, gain)
clip = clip_with_audio(clip, aclip)
except Exception:
pass
clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=fps,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Story narration applied."
# --- No narration: uniform duration slideshow ---
fps = 24
repeats = max(1, int(round(float(seconds_per_image) * fps)))
expanded = []
for frame in frames:
expanded.extend([frame] * repeats)
clip = ImageSequenceClip(expanded, fps=fps)
clip.write_videofile(
out_path,
codec="libx264",
audio_codec="aac",
fps=fps,
preset="medium",
threads=max(1, (os.cpu_count() or 2) // 2),
)
return out_path, "Done! Video created without narration."
# ---------- UI ----------
def update_voice_choices(backend_name: str):
    """Refresh the voice dropdown for the chosen TTS backend and report a status line."""
    voices = list_voices(backend_name)
    default = voices[0] if voices else None
    status = f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
    return gr.update(choices=voices, value=default), status
def ui():
    """
    Build the Gradio Blocks UI and return the (unlaunched) demo object.

    Left column: image uploads, ordering, timing, and output geometry.
    Right column: narration mode and the inputs used by each mode (only the
    inputs relevant to the chosen mode are used at render time).
    Below: TTS backend/voice picker, gain slider, run button, and outputs.

    NOTE(review): the pasted source lost its indentation, so the exact widget
    nesting (which controls sit inside which column) is reconstructed here —
    confirm against the deployed Space layout.
    """
    with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
        # Feature overview shown at the top of the page.
        gr.Markdown(
            """
# 🖼️ → 🎬 Slideshow Maker
- **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
- **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
- **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
"""
        )
        with gr.Row():
            with gr.Column(scale=1):
                # --- Left column: images + ordering + output geometry ---
                image_files = gr.Files(
                    label="Upload Images (multiple)",
                    file_count="multiple",
                    file_types=["image"],
                )
                sort_mode = gr.Radio(
                    ["Filename (A→Z)", "Filename (Z→A)", "Shuffle"],
                    value="Filename (A→Z)",
                    label="Image Order",
                )
                # Seed only matters when "Shuffle" ordering is selected.
                shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
                seconds_per_image = gr.Slider(
                    minimum=0.1, maximum=10.0, step=0.1, value=1.5,
                    label="Seconds per Image (used when not syncing to audio)"
                )
                with gr.Row():
                    width = gr.Number(value=1280, precision=0, label="Width (px)")
                    height = gr.Number(value=720, precision=0, label="Height (px)")
                fit_mode = gr.Radio(["contain", "cover", "stretch"], value="contain", label="Sizing Mode")
                bg_color = gr.ColorPicker(value="#000000", label="Background (for 'contain')")
            with gr.Column(scale=1):
                # --- Right column: narration mode + per-mode inputs ---
                narration_mode = gr.Radio(
                    ["None",
                     "Single story",
                     "Per-image (files)",
                     "Per-image (TTS per line)",
                     "Per-image (TTS multiline per image)"],
                    value="None",
                    label="Narration mode"
                )
                # Single-story UI
                story_text = gr.Textbox(
                    label="Story (Single track narration)",
                    placeholder="Type or paste your story...",
                    lines=20,
                )
                match_video_to_narration = gr.Checkbox(
                    value=True, label="Match video duration to narration length (single-story)"
                )
                # Per-image UI (files)
                per_image_audio_files = gr.Files(
                    label="Per-image audio files (optional) — matched by filename or order",
                    file_count="multiple",
                    file_types=["audio"]
                )
                sync_per_image_audio = gr.Checkbox(
                    value=True, label="Sync image to audio duration (per-image modes)"
                )
                # Per-image UI (text)
                per_image_texts = gr.Textbox(
                    label="Per-image TTS (one line per image)",
                    placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
                    lines=8,
                )
                per_image_multiline_blocks = gr.Textbox(
                    label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
                    placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
                    lines=40,
                )
        # TTS backend + voice picker (voice list is populated dynamically below).
        with gr.Row():
            tts_backend = gr.Dropdown(
                ["Coqui (VCTK multi-speaker)", "gTTS (simple)"],
                value="Coqui (VCTK multi-speaker)",
                label="TTS backend"
            )
            tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
            voice_status = gr.Markdown("")
        audio_gain_db = gr.Slider(
            minimum=-12, maximum=12, step=1, value=0, label="Narration Gain (dB)"
        )
        run_btn = gr.Button("Create Video", variant="primary")
        status = gr.Markdown("")
        video_out = gr.Video(label="Result", autoplay=False)
        # Load voices when backend changes
        tts_backend.change(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Also populate on initial load
        demo.load(
            fn=update_voice_choices,
            inputs=[tts_backend],
            outputs=[tts_voice, voice_status]
        )
        # Main action: argument order must match create_slideshow's signature.
        run_btn.click(
            fn=create_slideshow,
            inputs=[
                image_files,
                narration_mode,
                seconds_per_image,
                width, height,
                fit_mode, bg_color,
                sort_mode, shuffle_seed,
                # single-story
                story_text, match_video_to_narration,
                # per-image text inputs
                per_image_texts, per_image_multiline_blocks,
                # per-image files + sync
                per_image_audio_files, sync_per_image_audio,
                # tts
                tts_backend, tts_voice,
                audio_gain_db
            ],
            outputs=[video_out, status],
        )
        # Usage tips shown under the outputs.
        gr.Markdown(
            """
**Tips**
- *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
- *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
- *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
"""
        )
    return demo
# Entry point: build the UI and start the Gradio server when run as a script.
if __name__ == "__main__":
    ui().launch()