Vishwas1 committed on
Commit
1c01e22
·
verified ·
1 Parent(s): 902db85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -34
app.py CHANGED
@@ -1,11 +1,11 @@
1
- # app.py — Slideshow with per-image audio + multiline TTS per image + voice picker
2
- # Works with MoviePy v2.x; falls back to v1 where possible. Python 3.9+ safe.
3
 
4
  import os
5
  import re
6
  import tempfile
7
  import random
8
- from typing import Optional, List, Dict, Tuple
9
 
10
  import numpy as np
11
  from PIL import Image
@@ -46,8 +46,8 @@ except Exception:
46
  AudioFileClip,
47
  ImageClip,
48
  concatenate_videoclips,
49
- CompositeAudioClip as _CompositeAudioClip, # type: ignore
50
- concatenate_audioclips as _concat_audios, # type: ignore
51
  )
52
  MPY_V2 = False
53
 
@@ -87,7 +87,7 @@ def apply_linear_gain(audio_clip, gain_linear: float):
87
 
88
  def concat_audios_or_composite(clips: List):
89
  """
90
- Concatenate audio clips. Prefer built-in concatenator; otherwise composite
91
  sequentially using start offsets to emulate concatenation.
92
  """
93
  if not clips:
@@ -101,19 +101,21 @@ def concat_audios_or_composite(clips: List):
101
  pass
102
  # Fallback: sequential CompositeAudioClip
103
  if _CompositeAudioClip is not None:
104
- starts = []
105
  total = 0.0
106
  seq = []
107
  for c in clips:
108
- seq.append(c.set_start(total))
109
- total += float(c.duration)
 
 
 
110
  comp = _CompositeAudioClip(seq)
111
  try:
112
  comp = clip_with_duration(comp, total)
113
  except Exception:
114
  pass
115
  return comp
116
- # last resort
117
  return clips[0]
118
 
119
 
@@ -187,20 +189,57 @@ def _get_tts_backend(backend_name: str):
187
 
188
 
189
  def list_voices(backend_name: str) -> List[str]:
190
- if backend_name == "Coqui (VCTK multi-speaker)":
191
- try:
192
- tts = _get_tts_backend(backend_name)
193
- spks = list(getattr(tts, "speakers", []))
194
- # Bring a common male voice to the top if present
195
- for pref in ["p225", "p226", "p233", "p243"]:
196
- if pref in spks:
197
- spks.remove(pref)
198
- spks.insert(0, pref)
 
 
 
 
 
 
 
 
 
 
 
199
  break
200
- return sorted(spks) if not spks or spks[0] != "p225" else spks
201
- except Exception:
202
- return []
203
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
 
206
  def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
@@ -279,7 +318,6 @@ def build_audio_for_image_lines(
279
  text = txt.strip()
280
  if spk.strip():
281
  voice = spk.strip()
282
- # Synthesize this line
283
  out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
284
  gen = synth_tts_to_file(text, tts_backend, voice, out_p)
285
  if gen and os.path.exists(gen):
@@ -352,7 +390,7 @@ def create_slideshow(
352
  per_image_texts: str, # one line per image
353
  per_image_multiline_blocks: str, # blocks separated by blank lines
354
  per_image_audio_files: List, # uploaded audio files
355
- sync_per_image_audio: bool, # NEW: sync duration to audio for per-image modes
356
 
357
  # TTS config
358
  tts_backend: str,
@@ -445,7 +483,7 @@ def create_slideshow(
445
  )
446
  return out_path, "Done! Per-image audio applied."
447
 
448
- # --- Per-image TTS per single line (legacy one-line-per-image) ---
449
  if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
450
  lines = [ln.strip() for ln in per_image_texts.splitlines()]
451
  # Pad / trim to image count
@@ -496,7 +534,7 @@ def create_slideshow(
496
  )
497
  return out_path, "Done! Per-image TTS (single line) applied."
498
 
499
- # --- NEW: Per-image TTS multiline per image ---
500
  if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
501
  blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
502
  tmp_dir = tempfile.gettempdir()
@@ -538,7 +576,7 @@ def create_slideshow(
538
 
539
  # --- Single story (one track) ---
540
  if narration_mode == "Single story" and story_text.strip():
541
- # Build base video (uniform duration)
542
  fps = 24
543
  repeats = max(1, int(round(float(seconds_per_image) * fps)))
544
  expanded = []
@@ -597,7 +635,8 @@ def create_slideshow(
597
  def update_voice_choices(backend_name: str):
598
  voices = list_voices(backend_name)
599
  value = voices[0] if voices else None
600
- return gr.update(choices=voices, value=value), f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
 
601
 
602
 
603
  def ui():
@@ -651,7 +690,9 @@ def ui():
651
  # Single-story UI
652
  story_text = gr.Textbox(
653
  label="Story (Single track narration)",
654
- placeholder="Type or paste your story..."
 
 
655
  )
656
  match_video_to_narration = gr.Checkbox(
657
  value=True, label="Match video duration to narration length (single-story)"
@@ -670,11 +711,15 @@ def ui():
670
  # Per-image UI (text)
671
  per_image_texts = gr.Textbox(
672
  label="Per-image TTS (one line per image)",
673
- placeholder="Line 1 (image 1)\nLine 2 (image 2)\n..."
 
 
674
  )
675
  per_image_multiline_blocks = gr.Textbox(
676
  label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
677
- placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n..."
 
 
678
  )
679
 
680
  with gr.Row():
@@ -702,6 +747,13 @@ def ui():
702
  outputs=[tts_voice, voice_status]
703
  )
704
 
 
 
 
 
 
 
 
705
  # Main action
706
  run_btn.click(
707
  fn=create_slideshow,
@@ -714,9 +766,9 @@ def ui():
714
  sort_mode, shuffle_seed,
715
  # single-story
716
  story_text, match_video_to_narration,
717
- # per-image text
718
  per_image_texts, per_image_multiline_blocks,
719
- # per-image files
720
  per_image_audio_files, sync_per_image_audio,
721
  # tts
722
  tts_backend, tts_voice,
 
1
+ # app.py — Slideshow with per-image audio, multiline TTS per image, and voice picker
2
+ # Works with MoviePy v2.x; falls back to v1 when necessary. Python 3.9+ safe.
3
 
4
  import os
5
  import re
6
  import tempfile
7
  import random
8
+ from typing import Optional, List, Dict
9
 
10
  import numpy as np
11
  from PIL import Image
 
46
  AudioFileClip,
47
  ImageClip,
48
  concatenate_videoclips,
49
+ CompositeAudioClip as _CompositeAudioClip, # type: ignore
50
+ concatenate_audioclips as _concat_audios, # type: ignore
51
  )
52
  MPY_V2 = False
53
 
 
87
 
88
  def concat_audios_or_composite(clips: List):
89
  """
90
+ Concatenate audio clips. Prefer the built-in concatenator; otherwise composite
91
  sequentially using start offsets to emulate concatenation.
92
  """
93
  if not clips:
 
101
  pass
102
  # Fallback: sequential CompositeAudioClip
103
  if _CompositeAudioClip is not None:
 
104
  total = 0.0
105
  seq = []
106
  for c in clips:
107
+ try:
108
+ seq.append(c.set_start(total))
109
+ total += float(c.duration)
110
+ except Exception:
111
+ pass
112
  comp = _CompositeAudioClip(seq)
113
  try:
114
  comp = clip_with_duration(comp, total)
115
  except Exception:
116
  pass
117
  return comp
118
+ # Last resort
119
  return clips[0]
120
 
121
 
 
189
 
190
 
191
  def list_voices(backend_name: str) -> List[str]:
192
+ if backend_name != "Coqui (VCTK multi-speaker)":
193
+ return []
194
+
195
+ try:
196
+ tts = _get_tts_backend(backend_name)
197
+ candidates: List[str] = []
198
+
199
+ # Try common attributes across TTS versions
200
+ for path in [
201
+ "speakers",
202
+ "speaker_manager.speaker_names",
203
+ "speaker_manager.speaker_ids",
204
+ ]:
205
+ obj = tts
206
+ try:
207
+ for part in path.split("."):
208
+ obj = getattr(obj, part)
209
+ names = list(obj) if obj is not None else []
210
+ if names:
211
+ candidates = [str(x) for x in names]
212
  break
213
+ except Exception:
214
+ continue
215
+
216
+ # Sensible fallback if nothing found (known VCTK IDs)
217
+ if not candidates:
218
+ candidates = [
219
+ "p225","p226","p233","p243","p254","p256","p258","p259",
220
+ "p270","p273","p274","p278","p279","p302","p311","p316",
221
+ "p334","p345","p360","p363","p374"
222
+ ]
223
+
224
+ # Nudge common male IDs toward the top if present
225
+ male_pref = ["p225","p226","p233","p243","p270","p274","p279","p311","p345","p360","p363"]
226
+ ordered = candidates[:]
227
+ for pref in reversed(male_pref):
228
+ if pref in ordered:
229
+ ordered.remove(pref)
230
+ ordered.insert(0, pref)
231
+
232
+ # Deduplicate while preserving order
233
+ seen, final = set(), []
234
+ for v in ordered:
235
+ if v not in seen:
236
+ seen.add(v)
237
+ final.append(v)
238
+ return final
239
+
240
+ except Exception:
241
+ # Absolute fallback
242
+ return ["p225","p226","p233","p243"]
243
 
244
 
245
  def synth_tts_to_file(text: str, backend_name: str, voice: Optional[str], out_path: str) -> Optional[str]:
 
318
  text = txt.strip()
319
  if spk.strip():
320
  voice = spk.strip()
 
321
  out_p = os.path.join(tmp_dir, f"tts_seg_{random.randint(1, 1_000_000)}_{idx}.wav")
322
  gen = synth_tts_to_file(text, tts_backend, voice, out_p)
323
  if gen and os.path.exists(gen):
 
390
  per_image_texts: str, # one line per image
391
  per_image_multiline_blocks: str, # blocks separated by blank lines
392
  per_image_audio_files: List, # uploaded audio files
393
+ sync_per_image_audio: bool, # sync duration to audio for per-image modes
394
 
395
  # TTS config
396
  tts_backend: str,
 
483
  )
484
  return out_path, "Done! Per-image audio applied."
485
 
486
+ # --- Per-image TTS per single line ---
487
  if narration_mode == "Per-image (TTS per line)" and per_image_texts.strip():
488
  lines = [ln.strip() for ln in per_image_texts.splitlines()]
489
  # Pad / trim to image count
 
534
  )
535
  return out_path, "Done! Per-image TTS (single line) applied."
536
 
537
+ # --- Per-image TTS multiline per image ---
538
  if narration_mode == "Per-image (TTS multiline per image)" and per_image_multiline_blocks.strip():
539
  blocks = parse_multiline_blocks(per_image_multiline_blocks, num_images)
540
  tmp_dir = tempfile.gettempdir()
 
576
 
577
  # --- Single story (one track) ---
578
  if narration_mode == "Single story" and story_text.strip():
579
+ # Base video (uniform duration)
580
  fps = 24
581
  repeats = max(1, int(round(float(seconds_per_image) * fps)))
582
  expanded = []
 
635
  def update_voice_choices(backend_name: str):
636
  voices = list_voices(backend_name)
637
  value = voices[0] if voices else None
638
+ msg = f"Loaded {len(voices)} voices." if voices else "No voices found (or using gTTS)."
639
+ return gr.update(choices=voices, value=value), msg
640
 
641
 
642
  def ui():
 
690
  # Single-story UI
691
  story_text = gr.Textbox(
692
  label="Story (Single track narration)",
693
+ placeholder="Type or paste your story...",
694
+ lines=6,
695
+ autogrow=True
696
  )
697
  match_video_to_narration = gr.Checkbox(
698
  value=True, label="Match video duration to narration length (single-story)"
 
711
  # Per-image UI (text)
712
  per_image_texts = gr.Textbox(
713
  label="Per-image TTS (one line per image)",
714
+ placeholder="Line 1 (image 1)\nLine 2 (image 2)\n...",
715
+ lines=8,
716
+ autogrow=True
717
  )
718
  per_image_multiline_blocks = gr.Textbox(
719
  label="Per-image TTS (multiline): blocks separated by blank lines; use 'speaker| text' to override",
720
+ placeholder="p225| First line for image 1\nSecond line for image 1\n\nLine 1 for image 2\nLine 2 for image 2\n...",
721
+ lines=12,
722
+ autogrow=True
723
  )
724
 
725
  with gr.Row():
 
747
  outputs=[tts_voice, voice_status]
748
  )
749
 
750
+ # Also populate on initial load
751
+ demo.load(
752
+ fn=update_voice_choices,
753
+ inputs=[tts_backend],
754
+ outputs=[tts_voice, voice_status]
755
+ )
756
+
757
  # Main action
758
  run_btn.click(
759
  fn=create_slideshow,
 
766
  sort_mode, shuffle_seed,
767
  # single-story
768
  story_text, match_video_to_narration,
769
+ # per-image text inputs
770
  per_image_texts, per_image_multiline_blocks,
771
+ # per-image files + sync
772
  per_image_audio_files, sync_per_image_audio,
773
  # tts
774
  tts_backend, tts_voice,