Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
-
# app.py — Slideshow with per-image audio
|
| 2 |
-
# Works with MoviePy v2.x; falls back to v1
|
| 3 |
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
import tempfile
|
| 7 |
import random
|
| 8 |
-
from typing import Optional, List, Dict
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
from PIL import Image
|
|
@@ -46,8 +46,8 @@ except Exception:
|
|
| 46 |
AudioFileClip,
|
| 47 |
ImageClip,
|
| 48 |
concatenate_videoclips,
|
| 49 |
-
CompositeAudioClip as _CompositeAudioClip,
|
| 50 |
-
concatenate_audioclips as _concat_audios,
|
| 51 |
)
|
| 52 |
MPY_V2 = False
|
| 53 |
|
|
@@ -87,7 +87,7 @@ def apply_linear_gain(audio_clip, gain_linear: float):
|
|
| 87 |
|
| 88 |
def concat_audios_or_composite(clips: List):
|
| 89 |
"""
|
| 90 |
-
Concatenate audio clips. Prefer built-in concatenator; otherwise composite
|
| 91 |
sequentially using start offsets to emulate concatenation.
|
| 92 |
"""
|
| 93 |
if not clips:
|
|
@@ -101,19 +101,21 @@ def concat_audios_or_composite(clips: List):
|
|
| 101 |
pass
|
| 102 |
# Fallback: sequential CompositeAudioClip
|
| 103 |
if _CompositeAudioClip is not None:
|
| 104 |
-
starts = []
|
| 105 |
total = 0.0
|
| 106 |
seq = []
|
| 107 |
for c in clips:
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
| 110 |
comp = _CompositeAudioClip(seq)
|
| 111 |
try:
|
| 112 |
comp = clip_with_duration(comp, total)
|
| 113 |
except Exception:
|
| 114 |
pass
|
| 115 |
return comp
|
| 116 |
-
#
|
| 117 |
return clips[0]
|
| 118 |
|
| 119 |
|
|
@@ -187,20 +189,57 @@ def _get_tts_backend(backend_name: str):
|
|
| 187 |
|
| 188 |
|
| 189 |
def list_voices(backend_name: str) -> List[str]:
|
| 190 |
-
if backend_name =
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
break
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
|
| 206 |
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
|
|
@@ -279,7 +318,6 @@ def build_audio_for_image_lines(
|
|
| 279 |
text = txt.strip()
|
| 280 |
if spk.strip():
|
| 281 |
voice = spk.strip()
|
| 282 |
-
# Synthesize this line
|
| 283 |
out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
|
| 284 |
gen = synth_tts_to_file(text, tts_backend, voice, out_p)
|
| 285 |
if gen and os.path.exists(gen):
|
|
@@ -352,7 +390,7 @@ def create_slideshow(
|
|
| 352 |
per_image_texts: str, # one line per image
|
| 353 |
per_image_multiline_blocks: str, # blocks separated by blank lines
|
| 354 |
per_image_audio_files: List, # uploaded audio files
|
| 355 |
-
sync_per_image_audio: bool, #
|
| 356 |
|
| 357 |
# TTS config
|
| 358 |
tts_backend: str,
|
|
@@ -445,7 +483,7 @@ def create_slideshow(
|
|
| 445 |
)
|
| 446 |
return out_path, "Done! Per-image audio applied."
|
| 447 |
|
| 448 |
-
# --- Per-image TTS per single line
|
| 449 |
if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
|
| 450 |
lines = [ln.strip() for ln in per_image_texts.splitlines()]
|
| 451 |
# Pad / trim to image count
|
|
@@ -496,7 +534,7 @@ def create_slideshow(
|
|
| 496 |
)
|
| 497 |
return out_path, "Done! Per-image TTS (single line) applied."
|
| 498 |
|
| 499 |
-
# ---
|
| 500 |
if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
|
| 501 |
blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
|
| 502 |
tmp_dir = tempfile.gettempdir()
|
|
@@ -538,7 +576,7 @@ def create_slideshow(
|
|
| 538 |
|
| 539 |
# --- Single story (one track) ---
|
| 540 |
if narration_mode == "Single story" and story_text.strip():
|
| 541 |
-
#
|
| 542 |
fps = 24
|
| 543 |
repeats = max(1, int(round(float(seconds_per_image) * fps)))
|
| 544 |
expanded = []
|
|
@@ -597,7 +635,8 @@ def create_slideshow(
|
|
| 597 |
def update_voice_choices(backend_name: str):
|
| 598 |
voices = list_voices(backend_name)
|
| 599 |
value = voices[0] if voices else None
|
| 600 |
-
|
|
|
|
| 601 |
|
| 602 |
|
| 603 |
def ui():
|
|
@@ -651,7 +690,9 @@ def ui():
|
|
| 651 |
# Single-story UI
|
| 652 |
story_text = gr.Textbox(
|
| 653 |
label="Story (Single track narration)",
|
| 654 |
-
placeholder="Type or paste your story..."
|
|
|
|
|
|
|
| 655 |
)
|
| 656 |
match_video_to_narration = gr.Checkbox(
|
| 657 |
value=True, label="Match video duration to narration length (single-story)"
|
|
@@ -670,11 +711,15 @@ def ui():
|
|
| 670 |
# Per-image UI (text)
|
| 671 |
per_image_texts = gr.Textbox(
|
| 672 |
label="Per-image TTS (one line per image)",
|
| 673 |
-
placeholder="Line 1 (image 1)\nLine 2 (image 2)\n..."
|
|
|
|
|
|
|
| 674 |
)
|
| 675 |
per_image_multiline_blocks = gr.Textbox(
|
| 676 |
label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
|
| 677 |
-
placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n..."
|
|
|
|
|
|
|
| 678 |
)
|
| 679 |
|
| 680 |
with gr.Row():
|
|
@@ -702,6 +747,13 @@ def ui():
|
|
| 702 |
outputs=[tts_voice, voice_status]
|
| 703 |
)
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
# Main action
|
| 706 |
run_btn.click(
|
| 707 |
fn=create_slideshow,
|
|
@@ -714,9 +766,9 @@ def ui():
|
|
| 714 |
sort_mode, shuffle_seed,
|
| 715 |
# single-story
|
| 716 |
story_text, match_video_to_narration,
|
| 717 |
-
# per-image text
|
| 718 |
per_image_texts, per_image_multiline_blocks,
|
| 719 |
-
# per-image files
|
| 720 |
per_image_audio_files, sync_per_image_audio,
|
| 721 |
# tts
|
| 722 |
tts_backend, tts_voice,
|
|
|
|
| 1 |
+
# app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
|
| 2 |
+
# Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
|
| 3 |
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
import tempfile
|
| 7 |
import random
|
| 8 |
+
from typing import Optional, List, Dict
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
from PIL import Image
|
|
|
|
| 46 |
AudioFileClip,
|
| 47 |
ImageClip,
|
| 48 |
concatenate_videoclips,
|
| 49 |
+
CompositeAudioClip as _CompositeAudioClip, # type: ignore
|
| 50 |
+
concatenate_audioclips as _concat_audios, # type: ignore
|
| 51 |
)
|
| 52 |
MPY_V2 = False
|
| 53 |
|
|
|
|
| 87 |
|
| 88 |
def concat_audios_or_composite(clips: List):
|
| 89 |
"""
|
| 90 |
+
Concatenate audio clips. Prefer the built-in concatenator; otherwise composite
|
| 91 |
sequentially using start offsets to emulate concatenation.
|
| 92 |
"""
|
| 93 |
if not clips:
|
|
|
|
| 101 |
pass
|
| 102 |
# Fallback: sequential CompositeAudioClip
|
| 103 |
if _CompositeAudioClip is not None:
|
|
|
|
| 104 |
total = 0.0
|
| 105 |
seq = []
|
| 106 |
for c in clips:
|
| 107 |
+
try:
|
| 108 |
+
seq.append(c.set_start(total))
|
| 109 |
+
total += float(c.duration)
|
| 110 |
+
except Exception:
|
| 111 |
+
pass
|
| 112 |
comp = _CompositeAudioClip(seq)
|
| 113 |
try:
|
| 114 |
comp = clip_with_duration(comp, total)
|
| 115 |
except Exception:
|
| 116 |
pass
|
| 117 |
return comp
|
| 118 |
+
# Last resort
|
| 119 |
return clips[0]
|
| 120 |
|
| 121 |
|
|
|
|
| 189 |
|
| 190 |
|
| 191 |
def list_voices(backend_name: str) -> List[str]:
    """Return the speaker IDs offered for the given TTS backend.

    Only the "Coqui (VCTK multi-speaker)" backend has selectable voices;
    every other backend name yields an empty list.

    For the Coqui backend the loaded TTS object is probed through several
    attribute paths (they differ between TTS releases). The first path that
    produces a non-empty collection wins. If none does, a built-in list of
    known VCTK speaker IDs is used instead. The result is ordered so that a
    fixed set of preferred IDs comes first, and duplicates are dropped while
    keeping first-seen order.

    Any unexpected failure (including failing to load the backend) results
    in a minimal four-voice fallback list rather than an exception.
    """
    if backend_name != "Coqui (VCTK multi-speaker)":
        return []

    # Attribute paths tried in order on the TTS object.
    attribute_paths = (
        "speakers",
        "speaker_manager.speaker_names",
        "speaker_manager.speaker_ids",
    )
    # Used when the backend exposes no speaker list at all.
    known_vctk_ids = [
        "p225", "p226", "p233", "p243", "p254", "p256", "p258", "p259",
        "p270", "p273", "p274", "p278", "p279", "p302", "p311", "p316",
        "p334", "p345", "p360", "p363", "p374",
    ]
    # IDs that should be listed first (in this order) when available.
    preferred_ids = [
        "p225", "p226", "p233", "p243", "p270", "p274",
        "p279", "p311", "p345", "p360", "p363",
    ]

    try:
        backend = _get_tts_backend(backend_name)

        discovered: List[str] = []
        for dotted in attribute_paths:
            try:
                target = backend
                for attr in dotted.split("."):
                    target = getattr(target, attr)
                raw = [] if target is None else list(target)
                found = [str(item) for item in raw] if raw else []
            except Exception:
                # This path is missing or not iterable here; try the next one.
                continue
            if found:
                discovered = found
                break

        if not discovered:
            discovered = list(known_vctk_ids)

        # Preferred IDs that are actually available go first, in preference
        # order; everything else keeps its discovered order behind them.
        available = set(discovered)
        head = [vid for vid in preferred_ids if vid in available]
        tail = [vid for vid in discovered if vid not in preferred_ids]

        # dict.fromkeys drops duplicates while preserving first-seen order.
        return list(dict.fromkeys(head + tail))

    except Exception:
        # Absolute fallback
        return ["p225", "p226", "p233", "p243"]
|
| 243 |
|
| 244 |
|
| 245 |
def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
|
|
|
|
| 318 |
text = txt.strip()
|
| 319 |
if spk.strip():
|
| 320 |
voice = spk.strip()
|
|
|
|
| 321 |
out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
|
| 322 |
gen = synth_tts_to_file(text, tts_backend, voice, out_p)
|
| 323 |
if gen and os.path.exists(gen):
|
|
|
|
| 390 |
per_image_texts: str, # one line per image
|
| 391 |
per_image_multiline_blocks: str, # blocks separated by blank lines
|
| 392 |
per_image_audio_files: List, # uploaded audio files
|
| 393 |
+
sync_per_image_audio: bool, # sync duration to audio for per-image modes
|
| 394 |
|
| 395 |
# TTS config
|
| 396 |
tts_backend: str,
|
|
|
|
| 483 |
)
|
| 484 |
return out_path, "Done! Per-image audio applied."
|
| 485 |
|
| 486 |
+
# --- Per-image TTS per single line ---
|
| 487 |
if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
|
| 488 |
lines = [ln.strip() for ln in per_image_texts.splitlines()]
|
| 489 |
# Pad / trim to image count
|
|
|
|
| 534 |
)
|
| 535 |
return out_path, "Done! Per-image TTS (single line) applied."
|
| 536 |
|
| 537 |
+
# --- Per-image TTS multiline per image ---
|
| 538 |
if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
|
| 539 |
blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
|
| 540 |
tmp_dir = tempfile.gettempdir()
|
|
|
|
| 576 |
|
| 577 |
# --- Single story (one track) ---
|
| 578 |
if narration_mode == "Single story" and story_text.strip():
|
| 579 |
+
# Base video (uniform duration)
|
| 580 |
fps = 24
|
| 581 |
repeats = max(1, int(round(float(seconds_per_image) * fps)))
|
| 582 |
expanded = []
|
|
|
|
| 635 |
def update_voice_choices(backend_name: str):
    """Build the voice-picker refresh for the selected TTS backend.

    Looks up the voices for ``backend_name`` via ``list_voices`` and returns a
    pair: a ``gr.update`` carrying the new choices with the first voice
    preselected (or no selection when there are none), and a short status
    message describing how many voices were loaded.
    """
    available = list_voices(backend_name)

    # No voices: clear the selection and report it.
    if not available:
        return gr.update(choices=available, value=None), "No voices found (or using gTTS)."

    status = f"Loaded {len(available)} voices."
    return gr.update(choices=available, value=available[0]), status
|
| 640 |
|
| 641 |
|
| 642 |
def ui():
|
|
|
|
| 690 |
# Single-story UI
|
| 691 |
story_text = gr.Textbox(
|
| 692 |
label="Story (Single track narration)",
|
| 693 |
+
placeholder="Type or paste your story...",
|
| 694 |
+
lines=6,
|
| 695 |
+
autogrow=True
|
| 696 |
)
|
| 697 |
match_video_to_narration = gr.Checkbox(
|
| 698 |
value=True, label="Match video duration to narration length (single-story)"
|
|
|
|
| 711 |
# Per-image UI (text)
|
| 712 |
per_image_texts = gr.Textbox(
|
| 713 |
label="Per-image TTS (one line per image)",
|
| 714 |
+
placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
|
| 715 |
+
lines=8,
|
| 716 |
+
autogrow=True
|
| 717 |
)
|
| 718 |
per_image_multiline_blocks = gr.Textbox(
|
| 719 |
label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
|
| 720 |
+
placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
|
| 721 |
+
lines=12,
|
| 722 |
+
autogrow=True
|
| 723 |
)
|
| 724 |
|
| 725 |
with gr.Row():
|
|
|
|
| 747 |
outputs=[tts_voice, voice_status]
|
| 748 |
)
|
| 749 |
|
| 750 |
+
# Also populate on initial load
|
| 751 |
+
demo.load(
|
| 752 |
+
fn=update_voice_choices,
|
| 753 |
+
inputs=[tts_backend],
|
| 754 |
+
outputs=[tts_voice, voice_status]
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
# Main action
|
| 758 |
run_btn.click(
|
| 759 |
fn=create_slideshow,
|
|
|
|
| 766 |
sort_mode, shuffle_seed,
|
| 767 |
# single-story
|
| 768 |
story_text, match_video_to_narration,
|
| 769 |
+
# per-image text inputs
|
| 770 |
per_image_texts, per_image_multiline_blocks,
|
| 771 |
+
# per-image files + sync
|
| 772 |
per_image_audio_files, sync_per_image_audio,
|
| 773 |
# tts
|
| 774 |
tts_backend, tts_voice,
|