Vishwas1 committed on
Commit
902db85
·
verified ·
1 Parent(s): f47c03b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -98
app.py CHANGED
@@ -1,7 +1,8 @@
1
- # app.py — Slideshow with per-image audio + multi-voice TTS (HF Coqui)
2
- # Works with MoviePy v2.x; falls back to v1 where possible.
3
 
4
  import os
 
5
  import tempfile
6
  import random
7
  from typing import Optional, List, Dict, Tuple
@@ -13,19 +14,41 @@ import gradio as gr
13
  # ---- MoviePy imports with v2/v1 compatibility ----
14
  MPY_V2 = False
15
  afx = None # audio effects (v2)
 
 
16
 
17
  try:
18
  # v2.x preferred
19
- from moviepy import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips # type: ignore
 
 
 
 
 
20
  try:
21
  from moviepy import afx as _afx # type: ignore
22
  afx = _afx
23
  except Exception:
24
  afx = None
 
 
 
 
 
 
 
 
25
  MPY_V2 = True
26
  except Exception:
27
  # v1.x fallback
28
- from moviepy.editor import ImageSequenceClip, AudioFileClip, ImageClip, concatenate_videoclips # type: ignore
 
 
 
 
 
 
 
29
  MPY_V2 = False
30
 
31
 
@@ -62,13 +85,45 @@ def apply_linear_gain(audio_clip, gain_linear: float):
62
  return audio_clip
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # ---------- Image utilities ----------
66
 
67
  def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
68
  """
69
  Loads an image file and returns an RGB numpy array with exact (height, width, 3).
70
- fit modes:
71
- - "contain": letterbox to fit within target size (keeps aspect), background color fills the rest.
72
  - "cover": fill target size (keeps aspect) with center crop.
73
  - "stretch": distort to target size.
74
  """
@@ -85,11 +140,9 @@ def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain",
85
  if fit == "cover":
86
  # scale to cover, then center-crop
87
  if src_aspect > target_aspect:
88
- # image too wide -> fit height, crop width
89
  new_h = height
90
  new_w = int(round(src_aspect * new_h))
91
  else:
92
- # image too tall -> fit width, crop height
93
  new_w = width
94
  new_h = int(round(new_w / src_aspect))
95
  img = img.resize((new_w, new_h), Image.LANCZOS)
@@ -113,7 +166,6 @@ def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain",
113
  return np.array(canvas)
114
 
115
 
116
-
117
  # ---------- TTS backends ----------
118
 
119
  _TTS_CACHE: Dict[str, object] = {}
@@ -139,15 +191,13 @@ def list_voices(backend_name: str) -> List[str]:
139
  try:
140
  tts = _get_tts_backend(backend_name)
141
  spks = list(getattr(tts, "speakers", []))
142
- # Prefer a common male default if present
143
- default_pref = ["p225", "p226", "p233", "p243"]
144
- ordered = sorted(spks)
145
- for pref in default_pref:
146
- if pref in ordered:
147
- ordered.remove(pref)
148
- ordered.insert(0, pref)
149
  break
150
- return ordered
151
  except Exception:
152
  return []
153
  return []
@@ -161,7 +211,6 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
161
  if backend_name == "Coqui (VCTK multi-speaker)":
162
  try:
163
  tts = _get_tts_backend(backend_name)
164
- # Coqui writes WAV by default; we'll give a .wav path
165
  if not out_path.lower().endswith(".wav"):
166
  out_path = os.path.splitext(out_path)[0] + ".wav"
167
  tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
@@ -169,7 +218,6 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
169
  except Exception:
170
  return None
171
 
172
- # gTTS fallback
173
  if backend_name == "gTTS (simple)":
174
  try:
175
  from gtts import gTTS
@@ -183,73 +231,111 @@ def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_pa
183
  return None
184
 
185
 
186
- # ---------- Helpers for per-image mapping ----------
187
 
def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
    """
    Pair each image with an audio file: basename match first, then index order.

    An audio file matches an image when their file names, without directory
    and extension and compared case-insensitively, are equal. Audio files that
    matched nothing are then handed out, in their given order, to the images
    that are still unmatched.

    Args:
        image_paths: image file paths, in slideshow order.
        audio_paths: candidate audio file paths.

    Returns:
        A list as long as ``image_paths`` holding the chosen audio path for
        each image, or None where no audio is left to assign.
    """
    def _stem(path: str) -> str:
        # "dir/Name.EXT" -> "name"
        return os.path.splitext(os.path.basename(path))[0].lower()

    if not audio_paths:
        return [None] * len(image_paths)

    # When two audio files share a stem, the later one wins the name match.
    by_stem = {_stem(audio): audio for audio in audio_paths}

    # First pass: basename matches.
    matched = [by_stem.get(_stem(image)) for image in image_paths]
    used = {audio for audio in matched if audio is not None}

    # Second pass: fill the gaps in order. An iterator avoids the O(n)
    # list.pop(0) per assignment.
    spare = iter([audio for audio in audio_paths if audio not in used])
    return [audio if audio is not None else next(spare, None) for audio in matched]
217
 
218
 
219
- # ---------- Core builder ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[str]],
    audio_gain_db: float
):
    """
    Assemble a slideshow in which every image has its own duration and,
    optionally, its own audio file.

    frames, per_image_durations and per_image_audios are consumed in lockstep
    (zip), so the shortest of the three decides how many slides are built.
    audio_gain_db is converted to a linear factor (10 ** (dB / 20)) and applied
    only when that factor differs from 1.0 by more than 1e-3.
    Returns the concatenated clip (method="compose").
    """
    def _with_soundtrack(slide, audio_path):
        # Load the audio, apply gain if it is not ~unity, attach to the slide.
        track = AudioFileClip(audio_path)
        factor = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
        if abs(factor - 1.0) > 1e-3:
            track = apply_linear_gain(track, factor)
        return clip_with_audio(slide, track)

    slides = []
    for image, seconds, audio_path in zip(frames, per_image_durations, per_image_audios):
        slide = clip_with_duration(ImageClip(image), float(seconds))
        if audio_path:
            try:
                slide = _with_soundtrack(slide, audio_path)
            except Exception:
                # Best effort: any audio failure leaves the slide silent.
                pass
        slides.append(slide)

    # "compose" keeps audio and frame size aligned across slides.
    return concatenate_videoclips(slides, method="compose")
248
 
249
 
 
 
250
  def create_slideshow(
251
  image_files: List,
252
- narration_mode: str, # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)"
 
253
  seconds_per_image: float,
254
  width: int,
255
  height: int,
@@ -263,8 +349,10 @@ def create_slideshow(
263
  match_video_to_narration: bool,
264
 
265
  # per-image inputs
266
- per_image_texts: str, # one line per image; optional "speaker| text" when using Coqui
 
267
  per_image_audio_files: List, # uploaded audio files
 
268
 
269
  # TTS config
270
  tts_backend: str,
@@ -295,8 +383,8 @@ def create_slideshow(
295
  # Load frames
296
  width = int(width); height = int(height)
297
  frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
 
298
 
299
- # Build outputs based on narration_mode
300
  out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
301
 
302
  # --- Per-image AUDIO FILES ---
@@ -309,22 +397,44 @@ def create_slideshow(
309
  aud_paths.append(ap)
310
  aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
311
 
312
- per_img_audio = map_audio_to_images_by_name(paths, aud_paths)
313
-
314
- # Durations: match each audio if present, else fall back to seconds_per_image
315
- durations = []
316
- for ap in per_img_audio:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  if ap:
318
  try:
319
  aclip = AudioFileClip(ap)
320
- durations.append(float(aclip.duration))
 
321
  except Exception:
322
- durations.append(float(seconds_per_image))
 
323
  else:
324
- durations.append(float(seconds_per_image))
325
-
326
- final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
327
 
 
328
  final_clip.write_videofile(
329
  out_path,
330
  codec="libx264",
@@ -335,22 +445,21 @@ def create_slideshow(
335
  )
336
  return out_path, "Done! Per-image audio applied."
337
 
338
- # --- Per-image TTS per line ---
339
  if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
340
  lines = [ln.strip() for ln in per_image_texts.splitlines()]
341
  # Pad / trim to image count
342
- if len(lines) < len(paths):
343
- lines += [""] * (len(paths) - len(lines))
344
  else:
345
- lines = lines[:len(paths)]
346
 
347
- # Generate audio per line
348
  tmp_dir = tempfile.gettempdir()
349
- per_img_audio = []
350
- durations = []
 
351
  for idx, text in enumerate(lines):
352
  voice = tts_voice
353
- # Optional "speaker| text" override for Coqui
354
  if "|" in text and tts_backend.startswith("Coqui"):
355
  maybe_speaker, maybe_text = text.split("|", 1)
356
  if maybe_text.strip():
@@ -364,18 +473,59 @@ def create_slideshow(
364
  gen = synth_tts_to_file(text, tts_backend, voice, apath)
365
  apath = gen if gen and os.path.exists(gen) else None
366
 
367
- per_img_audio.append(apath)
368
  if apath:
369
  try:
370
  aclip = AudioFileClip(apath)
371
- durations.append(float(aclip.duration))
 
372
  except Exception:
373
- durations.append(float(seconds_per_image))
 
374
  else:
375
- durations.append(float(seconds_per_image))
 
376
 
377
- final_clip = build_variable_duration_video(frames, durations, per_img_audio, audio_gain_db)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
 
379
  final_clip.write_videofile(
380
  out_path,
381
  codec="libx264",
@@ -384,7 +534,7 @@ def create_slideshow(
384
  preset="medium",
385
  threads=max(1, (os.cpu_count() or 2) // 2),
386
  )
387
- return out_path, "Done! Per-image TTS applied."
388
 
389
  # --- Single story (one track) ---
390
  if narration_mode == "Single story" and story_text.strip():
@@ -451,12 +601,13 @@ def update_voice_choices(backend_name: str):
451
 
452
 
453
  def ui():
454
- with gr.Blocks(title="Slideshow + Per-Image Audio + Voice Picker", theme=gr.themes.Soft()) as demo:
455
  gr.Markdown(
456
  """
457
  # 🖼️ → 🎬 Slideshow Maker
458
- - **Per-image audio**: upload audio files (matched by filename or order) **or** generate per-image narration from text lines.
459
- - **TTS voices**: pick from **Coqui VCTK**'s multi-speaker voices (male/female), or use gTTS as a lightweight fallback.
 
460
  """
461
  )
462
 
@@ -475,7 +626,8 @@ def ui():
475
  shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
476
 
477
  seconds_per_image = gr.Slider(
478
- minimum=0.1, maximum=10.0, step=0.1, value=1.5, label="Seconds per Image (used when no per-image audio)"
 
479
  )
480
 
481
  with gr.Row():
@@ -487,10 +639,15 @@ def ui():
487
 
488
  with gr.Column(scale=1):
489
  narration_mode = gr.Radio(
490
- ["None", "Single story", "Per-image (files)", "Per-image (TTS per line)"],
 
 
 
 
491
  value="None",
492
  label="Narration mode"
493
  )
 
494
  # Single-story UI
495
  story_text = gr.Textbox(
496
  label="Story (Single track narration)",
@@ -500,15 +657,24 @@ def ui():
500
  value=True, label="Match video duration to narration length (single-story)"
501
  )
502
 
503
- # Per-image UI
504
  per_image_audio_files = gr.Files(
505
  label="Per-image audio files (optional) — matched by filename or order",
506
  file_count="multiple",
507
  file_types=["audio"]
508
  )
 
 
 
 
 
509
  per_image_texts = gr.Textbox(
510
- label="Per-image TTS text (one line per image). For Coqui, optional 'speaker| text' per line.",
511
- placeholder="Line 1 text\nLine 2 text\n..."
 
 
 
 
512
  )
513
 
514
  with gr.Row():
@@ -517,7 +683,7 @@ def ui():
517
  value="Coqui (VCTK multi-speaker)",
518
  label="TTS backend"
519
  )
520
- tts_voice = gr.Dropdown(choices=[], label="Voice (for Coqui)")
521
  voice_status = gr.Markdown("")
522
 
523
  audio_gain_db = gr.Slider(
@@ -548,8 +714,10 @@ def ui():
548
  sort_mode, shuffle_seed,
549
  # single-story
550
  story_text, match_video_to_narration,
551
- # per-image
552
- per_image_texts, per_image_audio_files,
 
 
553
  # tts
554
  tts_backend, tts_voice,
555
  audio_gain_db
@@ -560,9 +728,9 @@ def ui():
560
  gr.Markdown(
561
  """
562
  **Tips**
563
- - *Per-image audio (files)*: name audio like your images (e.g., `001.jpg` `001.wav`) for automatic matching.
564
- - *Per-image TTS per line*: supply the same number of lines as images; extra/missing lines are trimmed/padded.
565
- - *Coqui voice per line*: prefix a line with `speaker| text` to override the dropdown voice (e.g., `p225| Hello there`).
566
  """
567
  )
568
 
 
1
+ # app.py — Slideshow with per-image audio + multiline TTS per image + voice picker
2
+ # Works with MoviePy v2.x; falls back to v1 where possible. Python 3.9+ safe.
3
 
4
  import os
5
+ import re
6
  import tempfile
7
  import random
8
  from typing import Optional, List, Dict, Tuple
 
14
  # ---- MoviePy imports with v2/v1 compatibility ----
15
  MPY_V2 = False
16
  afx = None # audio effects (v2)
17
+ _CompositeAudioClip = None
18
+ _concat_audios = None
19
 
20
  try:
21
  # v2.x preferred
22
+ from moviepy import (
23
+ ImageSequenceClip,
24
+ AudioFileClip,
25
+ ImageClip,
26
+ concatenate_videoclips,
27
+ )
28
  try:
29
  from moviepy import afx as _afx # type: ignore
30
  afx = _afx
31
  except Exception:
32
  afx = None
33
+ try:
34
+ from moviepy import CompositeAudioClip as _CompositeAudioClip # type: ignore
35
+ except Exception:
36
+ _CompositeAudioClip = None
37
+ try:
38
+ from moviepy import concatenate_audioclips as _concat_audios # type: ignore
39
+ except Exception:
40
+ _concat_audios = None
41
  MPY_V2 = True
42
  except Exception:
43
  # v1.x fallback
44
+ from moviepy.editor import (
45
+ ImageSequenceClip,
46
+ AudioFileClip,
47
+ ImageClip,
48
+ concatenate_videoclips,
49
+ CompositeAudioClip as _CompositeAudioClip, # type: ignore
50
+ concatenate_audioclips as _concat_audios, # type: ignore
51
+ )
52
  MPY_V2 = False
53
 
54
 
 
85
  return audio_clip
86
 
87
 
def concat_audios_or_composite(clips: List):
    """
    Join audio clips end-to-end into one clip.

    The concatenator bound to the module-level ``_concat_audios`` is tried
    first. If it is missing or raises, concatenation is emulated by placing
    each clip at a running start offset inside a ``_CompositeAudioClip``.

    Args:
        clips: audio clips in playback order; each must expose ``duration``.

    Returns:
        None for an empty list, the clip itself for a one-element list,
        otherwise the joined clip. When neither strategy is available only the
        first clip is returned (best effort).
    """
    if not clips:
        return None
    if len(clips) == 1:
        return clips[0]

    if _concat_audios is not None:
        try:
            return _concat_audios(clips)
        except Exception:
            # Best effort: fall through to the composite emulation below.
            pass

    # Fallback: sequential CompositeAudioClip
    if _CompositeAudioClip is not None:
        offset = 0.0
        placed = []
        for clip in clips:
            # MoviePy v2 renamed set_start -> with_start; v1 only has
            # set_start. Use whichever this clip provides.
            shift = getattr(clip, "with_start", None) or getattr(clip, "set_start")
            placed.append(shift(offset))
            offset += float(clip.duration)
        combined = _CompositeAudioClip(placed)
        try:
            combined = clip_with_duration(combined, offset)
        except Exception:
            # Keep the composite's own duration if it cannot be set explicitly.
            pass
        return combined

    # last resort
    return clips[0]
118
+
119
+
120
  # ---------- Image utilities ----------
121
 
122
  def load_and_fit_image(path: str, width: int, height: int, fit: str = "contain", bg: str = "#000000") -> np.ndarray:
123
  """
124
  Loads an image file and returns an RGB numpy array with exact (height, width, 3).
125
+ fit:
126
+ - "contain": letterbox to fit within target size (keeps aspect), background fills rest.
127
  - "cover": fill target size (keeps aspect) with center crop.
128
  - "stretch": distort to target size.
129
  """
 
140
  if fit == "cover":
141
  # scale to cover, then center-crop
142
  if src_aspect > target_aspect:
 
143
  new_h = height
144
  new_w = int(round(src_aspect * new_h))
145
  else:
 
146
  new_w = width
147
  new_h = int(round(new_w / src_aspect))
148
  img = img.resize((new_w, new_h), Image.LANCZOS)
 
166
  return np.array(canvas)
167
 
168
 
 
169
  # ---------- TTS backends ----------
170
 
171
  _TTS_CACHE: Dict[str, object] = {}
 
191
  try:
192
  tts = _get_tts_backend(backend_name)
193
  spks = list(getattr(tts, "speakers", []))
194
+ # Bring a common male voice to the top if present
195
+ for pref in ["p225", "p226", "p233", "p243"]:
196
+ if pref in spks:
197
+ spks.remove(pref)
198
+ spks.insert(0, pref)
 
 
199
  break
200
+ return sorted(spks) if not spks or spks[0] != "p225" else spks
201
  except Exception:
202
  return []
203
  return []
 
211
  if backend_name == "Coqui (VCTK multi-speaker)":
212
  try:
213
  tts = _get_tts_backend(backend_name)
 
214
  if not out_path.lower().endswith(".wav"):
215
  out_path = os.path.splitext(out_path)[0] + ".wav"
216
  tts.tts_to_file(text=text, speaker=voice, file_path=out_path)
 
218
  except Exception:
219
  return None
220
 
 
221
  if backend_name == "gTTS (simple)":
222
  try:
223
  from gtts import gTTS
 
231
  return None
232
 
233
 
234
+ # ---------- Text parsing for multiline-per-image ----------
235
 
def parse_multiline_blocks(text: str, expected_images: int) -> List[List[str]]:
    """
    Turn a free-form text field into one list of TTS lines per image.

    Blocks are separated by blank (or whitespace-only) lines; the n-th block
    belongs to the n-th image. Inside a block every non-empty line, stripped,
    becomes its own TTS segment. Missing blocks are padded with empty lists and
    surplus blocks are dropped so the result lines up with expected_images.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return [[] for _ in range(expected_images)]

    chunks = []
    for piece in re.split(r"\n\s*\n", cleaned):
        piece = piece.strip()
        if piece:
            chunks.append(piece)

    # Pad/trim to match number of images
    shortfall = expected_images - len(chunks)
    if shortfall > 0:
        chunks.extend([""] * shortfall)
    elif shortfall < 0:
        chunks = chunks[:expected_images]

    return [
        [row.strip() for row in chunk.splitlines() if row.strip()]
        for chunk in chunks
    ]
254
 
255
 
256
+ # ---------- Build audio for each image from multiple lines ----------
257
+
def build_audio_for_image_lines(
    lines: List[str],
    tts_backend: str,
    default_voice: Optional[str],
    audio_gain_db: float,
    tmp_dir: str
):
    """
    Build the narration track for a single image from its text lines.

    For each line:
      - when the backend name starts with "Coqui" and the line contains "|",
        the part before the first "|" overrides the voice and the part after
        it becomes the spoken text (each only if non-empty after stripping);
      - the line is synthesized to its own file inside tmp_dir.
    Segments that fail to synthesize or cannot be opened are skipped. The
    remaining segments are concatenated in order and audio_gain_db is applied
    to the joined track as a linear factor (10 ** (dB / 20)) when that factor
    differs from 1.0 by more than 1e-3.

    Returns:
        (audio_clip, total_duration), or (None, 0.0) if no audio was produced.
    """
    # Local import keeps this block self-contained; uuid is stdlib.
    import uuid

    segments = []
    for idx, raw in enumerate(lines):
        voice = default_voice
        text = raw
        if "|" in raw and tts_backend.startswith("Coqui"):
            spk, txt = raw.split("|", 1)
            if txt.strip():
                text = txt.strip()
            if spk.strip():
                voice = spk.strip()

        # A uuid4 name cannot collide with segments from earlier or concurrent
        # runs in the shared temp directory (random.randint over 1e6 values
        # could, overwriting a file another clip is still reading).
        out_p = os.path.join(tmp_dir, f"tts_seg_{uuid.uuid4().hex}_{idx}.wav")
        gen = synth_tts_to_file(text, tts_backend, voice, out_p)
        if not (gen and os.path.exists(gen)):
            continue
        try:
            segments.append(AudioFileClip(gen))
        except Exception:
            # Best effort: an unreadable segment is dropped, the rest still play.
            continue

    if not segments:
        return None, 0.0

    combined = concat_audios_or_composite(segments)
    if combined is None:
        return None, 0.0

    # Apply gain on the final track rather than on every segment.
    gain = 10 ** (float(audio_gain_db) / 20.0) if audio_gain_db else 1.0
    if abs(gain - 1.0) > 1e-3:
        combined = apply_linear_gain(combined, gain)

    return combined, float(combined.duration)
306
+
307
+
308
+ # ---------- Variable-duration video (per-image) ----------
309
 
def build_variable_duration_video(
    frames: List[np.ndarray],
    per_image_durations: List[float],
    per_image_audios: List[Optional[object]],  # AudioFileClip or CompositeAudioClip
):
    """
    Assemble a slideshow in which every image has its own duration and,
    optionally, its own ready-made audio clip.

    The three lists are consumed in lockstep (zip), so the shortest one decides
    how many slides are built. Each duration is floored at 0.05 seconds.
    Returns the concatenated clip (method="compose").
    """
    def _slide(image, seconds, track):
        still = clip_with_duration(ImageClip(image), float(max(0.05, seconds)))
        if track is None:
            return still
        try:
            return clip_with_audio(still, track)
        except Exception:
            # Best effort: if the audio cannot be attached, keep a silent slide.
            return still

    slides = [
        _slide(image, seconds, track)
        for image, seconds, track in zip(frames, per_image_durations, per_image_audios)
    ]
    return concatenate_videoclips(slides, method="compose")
331
 
332
 
333
+ # ---------- Main create function ----------
334
+
335
  def create_slideshow(
336
  image_files: List,
337
+
338
+ narration_mode: str, # "None" | "Single story" | "Per-image (files)" | "Per-image (TTS per line)" | "Per-image (TTS multiline per image)"
339
  seconds_per_image: float,
340
  width: int,
341
  height: int,
 
349
  match_video_to_narration: bool,
350
 
351
  # per-image inputs
352
+ per_image_texts: str, # one line per image
353
+ per_image_multiline_blocks: str, # blocks separated by blank lines
354
  per_image_audio_files: List, # uploaded audio files
355
+ sync_per_image_audio: bool, # NEW: sync duration to audio for per-image modes
356
 
357
  # TTS config
358
  tts_backend: str,
 
383
  # Load frames
384
  width = int(width); height = int(height)
385
  frames = [load_and_fit_image(p, width, height, fit=fit_mode, bg=bg_color) for p in paths]
386
+ num_images = len(frames)
387
 
 
388
  out_path = os.path.join(tempfile.gettempdir(), "slideshow_output.mp4")
389
 
390
  # --- Per-image AUDIO FILES ---
 
397
  aud_paths.append(ap)
398
  aud_paths = sorted(aud_paths, key=lambda p: os.path.basename(p).lower())
399
 
400
+ # Basename match, then index fallback
def map_audio_to_images_by_name(image_paths: List[str], audio_paths: List[str]) -> List[Optional[str]]:
    """
    Pair each image with an audio file: basename match first, index order second.

    Names are compared without directory or extension, case-insensitively.
    Audio files that matched no image are assigned, in their given order, to
    the images still lacking audio. Returns one entry per image (None where no
    audio is left).
    """
    def _stem(path: str) -> str:
        return os.path.splitext(os.path.basename(path))[0].lower()

    if not audio_paths:
        return [None] * len(image_paths)

    # Later files win when several audio files share the same stem.
    audio_by_stem = {_stem(a): a for a in audio_paths}
    result = [audio_by_stem.get(_stem(ip)) for ip in image_paths]

    used = {a for a in result if a is not None}
    # Iterator instead of repeated list.pop(0), which is O(n) per call.
    leftover = iter([a for a in audio_paths if a not in used])
    return [a if a is not None else next(leftover, None) for a in result]
419
+
420
+ per_img_audio_paths = map_audio_to_images_by_name(paths, aud_paths)
421
+
422
+ per_img_audios = []
423
+ per_img_durs = []
424
+ for ap in per_img_audio_paths:
425
  if ap:
426
  try:
427
  aclip = AudioFileClip(ap)
428
+ per_img_audios.append(aclip)
429
+ per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
430
  except Exception:
431
+ per_img_audios.append(None)
432
+ per_img_durs.append(float(seconds_per_image))
433
  else:
434
+ per_img_audios.append(None)
435
+ per_img_durs.append(float(seconds_per_image))
 
436
 
437
+ final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
438
  final_clip.write_videofile(
439
  out_path,
440
  codec="libx264",
 
445
  )
446
  return out_path, "Done! Per-image audio applied."
447
 
448
+ # --- Per-image TTS per single line (legacy one-line-per-image) ---
449
  if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
450
  lines = [ln.strip() for ln in per_image_texts.splitlines()]
451
  # Pad / trim to image count
452
+ if len(lines) < num_images:
453
+ lines += [""] * (num_images - len(lines))
454
  else:
455
+ lines = lines[:num_images]
456
 
 
457
  tmp_dir = tempfile.gettempdir()
458
+ per_img_audios = []
459
+ per_img_durs = []
460
+
461
  for idx, text in enumerate(lines):
462
  voice = tts_voice
 
463
  if "|" in text and tts_backend.startswith("Coqui"):
464
  maybe_speaker, maybe_text = text.split("|", 1)
465
  if maybe_text.strip():
 
473
  gen = synth_tts_to_file(text, tts_backend, voice, apath)
474
  apath = gen if gen and os.path.exists(gen) else None
475
 
 
476
  if apath:
477
  try:
478
  aclip = AudioFileClip(apath)
479
+ per_img_audios.append(aclip)
480
+ per_img_durs.append(float(aclip.duration) if sync_per_image_audio else float(seconds_per_image))
481
  except Exception:
482
+ per_img_audios.append(None)
483
+ per_img_durs.append(float(seconds_per_image))
484
  else:
485
+ per_img_audios.append(None)
486
+ per_img_durs.append(float(seconds_per_image))
487
 
488
+ final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
489
+ final_clip.write_videofile(
490
+ out_path,
491
+ codec="libx264",
492
+ audio_codec="aac",
493
+ fps=24,
494
+ preset="medium",
495
+ threads=max(1, (os.cpu_count() or 2) // 2),
496
+ )
497
+ return out_path, "Done! Per-image TTS (single line) applied."
498
+
499
+ # --- NEW: Per-image TTS multiline per image ---
500
+ if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
501
+ blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
502
+ tmp_dir = tempfile.gettempdir()
503
+
504
+ per_img_audios = []
505
+ per_img_durs = []
506
+
507
+ for idx, lines in enumerate(blocks):
508
+ if not lines:
509
+ per_img_audios.append(None)
510
+ per_img_durs.append(float(seconds_per_image))
511
+ continue
512
+
513
+ aclip, total = build_audio_for_image_lines(
514
+ lines=lines,
515
+ tts_backend=tts_backend,
516
+ default_voice=tts_voice,
517
+ audio_gain_db=audio_gain_db,
518
+ tmp_dir=tmp_dir
519
+ )
520
+
521
+ if aclip is not None:
522
+ per_img_audios.append(aclip)
523
+ per_img_durs.append(float(total) if sync_per_image_audio else float(seconds_per_image))
524
+ else:
525
+ per_img_audios.append(None)
526
+ per_img_durs.append(float(seconds_per_image))
527
 
528
+ final_clip = build_variable_duration_video(frames, per_img_durs, per_img_audios)
529
  final_clip.write_videofile(
530
  out_path,
531
  codec="libx264",
 
534
  preset="medium",
535
  threads=max(1, (os.cpu_count() or 2) // 2),
536
  )
537
+ return out_path, "Done! Per-image multiline TTS applied."
538
 
539
  # --- Single story (one track) ---
540
  if narration_mode == "Single story" and story_text.strip():
 
601
 
602
 
603
  def ui():
604
+ with gr.Blocks(title="Slideshow + Per-Image Audio + Multiline TTS + Voice Picker", theme=gr.themes.Soft()) as demo:
605
  gr.Markdown(
606
  """
607
  # 🖼️ → 🎬 Slideshow Maker
608
+ - **Per-image audio**: upload audio files, one (or more) per image (matched by filename or order).
609
+ - **Per-image TTS (multiline)**: write blocks separated by **blank lines**; lines inside a block are spoken sequentially for that image.
610
+ - **TTS voices**: pick from **Coqui VCTK** multi-speaker voices (male/female) or use gTTS as a lightweight fallback.
611
  """
612
  )
613
 
 
626
  shuffle_seed = gr.Number(value=0, precision=0, label="Shuffle Seed (integer)")
627
 
628
  seconds_per_image = gr.Slider(
629
+ minimum=0.1, maximum=10.0, step=0.1, value=1.5,
630
+ label="Seconds per Image (used when not syncing to audio)"
631
  )
632
 
633
  with gr.Row():
 
639
 
640
  with gr.Column(scale=1):
641
  narration_mode = gr.Radio(
642
+ ["None",
643
+ "Single story",
644
+ "Per-image (files)",
645
+ "Per-image (TTS per line)",
646
+ "Per-image (TTS multiline per image)"],
647
  value="None",
648
  label="Narration mode"
649
  )
650
+
651
  # Single-story UI
652
  story_text = gr.Textbox(
653
  label="Story (Single track narration)",
 
657
  value=True, label="Match video duration to narration length (single-story)"
658
  )
659
 
660
+ # Per-image UI (files)
661
  per_image_audio_files = gr.Files(
662
  label="Per-image audio files (optional) — matched by filename or order",
663
  file_count="multiple",
664
  file_types=["audio"]
665
  )
666
+ sync_per_image_audio = gr.Checkbox(
667
+ value=True, label="Sync image to audio duration (per-image modes)"
668
+ )
669
+
670
+ # Per-image UI (text)
671
  per_image_texts = gr.Textbox(
672
+ label="Per-image TTS (one line per image)",
673
+ placeholder="Line 1 (image 1)\nLine 2 (image 2)\n..."
674
+ )
675
+ per_image_multiline_blocks = gr.Textbox(
676
+ label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
677
+ placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n..."
678
  )
679
 
680
  with gr.Row():
 
683
  value="Coqui (VCTK multi-speaker)",
684
  label="TTS backend"
685
  )
686
+ tts_voice = gr.Dropdown(choices=[], label="Default Voice (for Coqui)")
687
  voice_status = gr.Markdown("")
688
 
689
  audio_gain_db = gr.Slider(
 
714
  sort_mode, shuffle_seed,
715
  # single-story
716
  story_text, match_video_to_narration,
717
+ # per-image text
718
+ per_image_texts, per_image_multiline_blocks,
719
+ # per-image files
720
+ per_image_audio_files, sync_per_image_audio,
721
  # tts
722
  tts_backend, tts_voice,
723
  audio_gain_db
 
728
  gr.Markdown(
729
  """
730
  **Tips**
731
+ - *Multiline per image*: separate image blocks with a **blank line**. Within each block, lines are spoken in order.
732
+ - *Coqui per-line speaker*: prefix a line with `speaker| text`, e.g., `p225| Hello there`.
733
+ - *Sync option*: turn it on to make each image stay up for the full duration of its own audio.
734
  """
735
  )
736