Files changed (3) hide show
  1. README.md +2 -1
  2. app.py +116 -11
  3. tts/engine.py +22 -1
README.md CHANGED
@@ -85,7 +85,8 @@ huggingface-cli upload <user>/podify . --repo-type=space
85
  MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
86
  A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
87
 
 
88
  #backyard-ai
89
- Blog:
90
  Social Media Post:
91
  Demo:
 
85
  MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
86
  A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
87
 
88
+
89
  #backyard-ai
90
+ Blog: [Article](https://huggingface.co/blog/build-small-hackathon/podify)
91
  Social Media Post:
92
  Demo:
app.py CHANGED
@@ -88,6 +88,15 @@ LIBRARY_VOICE_NAMES = list(_VOICE_NAME_TO_ID.keys())
88
  _VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
89
 
90
 
 
 
 
 
 
 
 
 
 
91
  def _voice_config_for(name):
92
  """Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
93
  vid = _VOICE_NAME_TO_ID.get(name)
@@ -373,7 +382,7 @@ def run_research(topic, style, duration, num_speakers, *voice_names, progress=gr
373
  topic = (topic or "").strip()
374
  if not topic:
375
  raise gr.Error("Please enter a topic for the podcast.")
376
- n = int(num_speakers)
377
  # Use the picked voice names as the script's speaker names when they're distinct, so the
378
  # dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
379
  # duplicate pick simply falls back to generic Host/Guest labels.)
@@ -427,7 +436,10 @@ def run_tts(lines, speakers, topic, bed, *voice_names, progress=gr.Progress()):
427
  tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
428
 
429
  progress(0.05, desc="Analyzing script & pacing…")
430
- sr, audio = tts_engine.generate_podcast(tts_lines, voice_map, progress=progress)
 
 
 
431
 
432
  # Mix the selected background-music bed under the voices (no-op for "No music").
433
  progress(0.95, desc="Mixing the sound bed…")
@@ -567,6 +579,14 @@ footer { display:none !important; }
567
  /* ---------- cards / panels ---------- */
568
  .pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
569
  border-radius:18px !important; padding:20px 22px !important; }
 
 
 
 
 
 
 
 
570
  .pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
571
  display:flex;align-items:center;gap:7px;margin-bottom:8px; }
572
  .pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
@@ -576,8 +596,26 @@ footer { display:none !important; }
576
  .pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
577
 
578
  /* topic textbox */
579
- #pf-topic textarea { background:transparent !important; border:none !important; font-size:21px !important;
580
- color:var(--pf-text) !important; line-height:1.5 !important; box-shadow:none !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  #pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
582
  border-radius:18px !important; padding:20px 22px 22px !important; }
583
 
@@ -594,10 +632,69 @@ footer { display:none !important; }
594
  .pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
595
  border-radius:15px !important; padding:14px 16px !important; }
596
  .pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
597
- text-transform:uppercase !important; color:var(--pf-faint) !important; font-weight:700 !important; }
598
- .pf-config-card .wrap, .pf-config-card input, .pf-config-card select,
599
- .pf-config-card .secondary-wrap { background:transparent !important; border:none !important; color:#fff !important; }
600
- .pf-config-card .gr-box, .pf-config-card .container { border:none !important; background:transparent !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  /* dropdowns open downward — keep the popup above the cards that follow it, otherwise
602
  the next card paints over all but the first option. Raise the row's stacking context
603
  (no overflow change, so nothing gets clipped). */
@@ -911,7 +1008,10 @@ def build_ui():
911
  value=2, label="LENGTH", filterable=False,
912
  )
913
  with gr.Column(elem_classes=["pf-config-card"]):
914
- num_speakers = gr.Slider(1, MAX_SPEAKERS, value=2, step=1, label="VOICES")
 
 
 
915
  with gr.Column(elem_classes=["pf-config-card"]):
916
  bed_step1 = gr.Dropdown(
917
  [b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
@@ -1085,14 +1185,19 @@ def build_ui():
1085
  script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
1086
 
1087
  # show one voice picker per selected speaker
 
 
 
 
1088
  num_speakers.change(
1089
- lambda n: [gr.update(visible=i < int(n)) for i in range(MAX_SPEAKERS)],
1090
  inputs=[num_speakers], outputs=voice_pickers,
1091
  )
1092
 
1093
  # keep the header cast chip in sync with the picked voices (all pages)
1094
  def _refresh_headers(n, *names):
1095
- sel = [names[i] for i in range(int(n)) if i < len(names) and names[i]]
 
1096
  return (
1097
  gr.update(value=_header("Create",
1098
  "Type a topic — Podify writes &amp; voices it", sel)),
 
88
  _VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
89
 
90
 
91
def _speaker_count(value, default: int = 2) -> int:
    """Coerce a raw Gradio slider value into a usable speaker count.

    The slider can deliver transient values such as ``None`` or a float like
    ``2.0``. The value is converted through ``float`` and truncated to an
    ``int``; anything that cannot be converted falls back to ``default``.
    The result is then clamped to the range ``1..MAX_SPEAKERS``.

    Args:
        value: Raw slider value (number, numeric string, or ``None``).
        default: Count to use when ``value`` cannot be converted.

    Returns:
        An integer between 1 and ``MAX_SPEAKERS`` inclusive.
    """
    try:
        n = int(float(value))
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None, ValueError covers non-numeric text and NaN,
        # OverflowError covers infinities (int(float("inf"))). All of them are
        # equally unusable, so none should crash the event handler.
        n = default
    return max(1, min(MAX_SPEAKERS, n))
98
+
99
+
100
  def _voice_config_for(name):
101
  """Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
102
  vid = _VOICE_NAME_TO_ID.get(name)
 
382
  topic = (topic or "").strip()
383
  if not topic:
384
  raise gr.Error("Please enter a topic for the podcast.")
385
+ n = _speaker_count(num_speakers)
386
  # Use the picked voice names as the script's speaker names when they're distinct, so the
387
  # dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
388
  # duplicate pick simply falls back to generic Host/Guest labels.)
 
436
  tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
437
 
438
  progress(0.05, desc="Analyzing script & pacing…")
439
+ try:
440
+ sr, audio = tts_engine.generate_podcast(tts_lines, voice_map, progress=progress)
441
+ except tts_engine.TTSModelAccessError as e:
442
+ raise gr.Error(str(e)) from e
443
 
444
  # Mix the selected background-music bed under the voices (no-op for "No music").
445
  progress(0.95, desc="Mixing the sound bed…")
 
579
  /* ---------- cards / panels ---------- */
580
  .pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
581
  border-radius:18px !important; padding:20px 22px !important; }
582
+ .pf-card .form {
583
+ background:transparent !important; border:none !important; box-shadow:none !important;
584
+ padding:0 !important;
585
+ }
586
+ .pf-card .form > .pf-config-card {
587
+ background:rgba(255,255,255,.035) !important;
588
+ border-color:rgba(255,255,255,.08) !important;
589
+ }
590
  .pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
591
  display:flex;align-items:center;gap:7px;margin-bottom:8px; }
592
  .pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
 
596
  .pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
597
 
598
  /* topic textbox */
599
+ #pf-topic-card, .pf-config-card {
600
+ --block-background-fill:transparent;
601
+ --block-border-color:transparent;
602
+ --input-background-fill:rgba(255,255,255,.035);
603
+ --input-border-color:rgba(255,255,255,.10);
604
+ --input-text-color:var(--pf-text);
605
+ --body-text-color:var(--pf-text);
606
+ --block-label-text-color:#b9bbcf;
607
+ }
608
+ #pf-topic, #pf-topic .block, #pf-topic .form, #pf-topic .wrap,
609
+ #pf-topic .container, #pf-topic .input-container,
610
+ #pf-topic-card .block, #pf-topic-card .form, #pf-topic-card .wrap,
611
+ #pf-topic-card .container, #pf-topic-card .input-container {
612
+ background:transparent !important; border:none !important; box-shadow:none !important;
613
+ }
614
+ #pf-topic textarea { background:rgba(255,255,255,.035) !important;
615
+ border:1px solid rgba(255,255,255,.10) !important; border-radius:14px !important;
616
+ font-size:21px !important; color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
617
+ line-height:1.5 !important; box-shadow:none !important; padding:18px 20px !important; }
618
+ #pf-topic textarea::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
619
  #pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
620
  border-radius:18px !important; padding:20px 22px 22px !important; }
621
 
 
632
  .pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
633
  border-radius:15px !important; padding:14px 16px !important; }
634
  .pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
635
+ text-transform:uppercase !important; color:#aeb1c6 !important;
636
+ -webkit-text-fill-color:#aeb1c6 !important; font-weight:700 !important; }
637
+ .pf-config-card .block, .pf-config-card .form, .pf-config-card .gr-box,
638
+ .pf-config-card .container, .pf-config-card .wrap {
639
+ background:transparent !important; border:none !important; box-shadow:none !important;
640
+ }
641
+ .pf-config-card .secondary-wrap, .pf-config-card .input-container,
642
+ .pf-config-card input, .pf-config-card select,
643
+ .pf-config-card [data-testid="dropdown"], .pf-config-card [role="button"] {
644
+ background:rgba(255,255,255,.045) !important;
645
+ border:1px solid rgba(255,255,255,.10) !important; border-radius:10px !important;
646
+ color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
647
+ box-shadow:none !important;
648
+ }
649
+ .pf-config-card input, .pf-config-card select { min-height:40px !important; padding:7px 10px !important; }
650
+ .pf-config-card input::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
651
+ .pf-config-card .secondary-wrap *, .pf-config-card [data-testid="dropdown"] *,
652
+ .pf-config-card [role="button"] * {
653
+ color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
654
+ }
655
+ .pf-config-card svg { color:#c9b8ff !important; fill:currentColor !important; }
656
+ .pf-config-card option { background:var(--pf-panel-2) !important; color:var(--pf-text) !important; }
657
+ .pf-config-card .slider input, .pf-config-card input[type="range"] {
658
+ background:transparent !important; border:none !important; -webkit-text-fill-color:initial !important;
659
+ }
660
+ .pf-config-card .input-container input[type="number"],
661
+ .pf-config-card input[data-testid="number-input"] {
662
+ width:74px !important; min-width:74px !important; max-width:74px !important;
663
+ height:40px !important; min-height:40px !important; padding:0 10px !important;
664
+ text-align:center !important; background:rgba(255,255,255,.045) !important;
665
+ border:1px solid rgba(255,255,255,.10) !important; border-radius:9px !important;
666
+ color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
667
+ }
668
+ #pf-voices input[data-testid="number-input"] {
669
+ width:54px !important; min-width:54px !important; max-width:54px !important;
670
+ height:34px !important; min-height:34px !important; line-height:34px !important;
671
+ padding:0 !important; box-sizing:border-box !important; text-align:center !important;
672
+ font-size:15px !important; font-weight:800 !important;
673
+ background:rgba(255,255,255,.055) !important;
674
+ border:1px solid rgba(255,255,255,.13) !important; border-radius:9px !important;
675
+ }
676
+ #pf-voices input[data-testid="number-input"]::-webkit-outer-spin-button,
677
+ #pf-voices input[data-testid="number-input"]::-webkit-inner-spin-button {
678
+ -webkit-appearance:none !important; margin:0 !important;
679
+ }
680
+ .pf-config-card .slider, .pf-config-card .slider * {
681
+ color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
682
+ }
683
+ .gradio-container .options, .gradio-container .options ul,
684
+ .gradio-container [role="listbox"] {
685
+ background:var(--pf-panel-2) !important; border:1px solid rgba(124,92,255,.34) !important;
686
+ color:var(--pf-text) !important; box-shadow:0 18px 46px rgba(0,0,0,.55) !important;
687
+ }
688
+ .gradio-container .options li, .gradio-container .options .item,
689
+ .gradio-container [role="option"] {
690
+ background:transparent !important; color:var(--pf-text) !important;
691
+ -webkit-text-fill-color:var(--pf-text) !important;
692
+ }
693
+ .gradio-container .options li:hover, .gradio-container .options .item:hover,
694
+ .gradio-container [role="option"]:hover,
695
+ .gradio-container [role="option"][aria-selected="true"] {
696
+ background:rgba(124,92,255,.18) !important; color:#fff !important; -webkit-text-fill-color:#fff !important;
697
+ }
698
  /* dropdowns open downward — keep the popup above the cards that follow it, otherwise
699
  the next card paints over all but the first option. Raise the row's stacking context
700
  (no overflow change, so nothing gets clipped). */
 
1008
  value=2, label="LENGTH", filterable=False,
1009
  )
1010
  with gr.Column(elem_classes=["pf-config-card"]):
1011
+ num_speakers = gr.Slider(
1012
+ 1, MAX_SPEAKERS, value=2, step=1, label="VOICES",
1013
+ elem_id="pf-voices", show_reset_button=False,
1014
+ )
1015
  with gr.Column(elem_classes=["pf-config-card"]):
1016
  bed_step1 = gr.Dropdown(
1017
  [b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
 
1185
  script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
1186
 
1187
  # show one voice picker per selected speaker
1188
def _voice_picker_visibility(n):
    """Return one visibility update per voice-picker slot.

    Slots whose index is below the normalized speaker count are shown; the
    remaining slots, up to MAX_SPEAKERS, are hidden.
    """
    active = _speaker_count(n)
    updates = []
    for slot in range(MAX_SPEAKERS):
        updates.append(gr.update(visible=slot < active))
    return updates
1191
+
1192
  num_speakers.change(
1193
+ _voice_picker_visibility,
1194
  inputs=[num_speakers], outputs=voice_pickers,
1195
  )
1196
 
1197
  # keep the header cast chip in sync with the picked voices (all pages)
1198
  def _refresh_headers(n, *names):
1199
+ count = _speaker_count(n)
1200
+ sel = [names[i] for i in range(count) if i < len(names) and names[i]]
1201
  return (
1202
  gr.update(value=_header("Create",
1203
  "Type a topic — Podify writes &amp; voices it", sel)),
tts/engine.py CHANGED
@@ -43,6 +43,10 @@ _ENGINE = None # cached TTSInferenceEngine
43
  _SAMPLE_RATE = 44100
44
 
45
 
 
 
 
 
46
  @dataclass
47
  class VoiceConfig:
48
  """Resolved voice for one speaker: a reference clip+text, or model default."""
@@ -146,7 +150,24 @@ def _load_engine():
146
  device = "cuda" if torch.cuda.is_available() else "cpu"
147
  precision = torch.half if device == "cuda" else torch.float32
148
 
149
- checkpoint_dir = snapshot_download(repo_id=TTS_MODEL_REPO)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  llama_queue = launch_thread_safe_queue(
152
  checkpoint_path=checkpoint_dir,
 
43
  _SAMPLE_RATE = 44100
44
 
45
 
46
class TTSModelAccessError(RuntimeError):
    """Signals that the configured TTS model could not be fetched from the Hugging Face Hub."""
48
+
49
+
50
  @dataclass
51
  class VoiceConfig:
52
  """Resolved voice for one speaker: a reference clip+text, or model default."""
 
150
  device = "cuda" if torch.cuda.is_available() else "cpu"
151
  precision = torch.half if device == "cuda" else torch.float32
152
 
153
+ token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
154
+ try:
155
+ checkpoint_dir = snapshot_download(repo_id=TTS_MODEL_REPO, token=token)
156
+ except Exception as e:
157
+ msg = str(e)
158
+ if type(e).__name__ == "GatedRepoError" or "Cannot access gated repo" in msg or "403" in msg:
159
+ access_url = (
160
+ "https://huggingface.co/fishaudio/s1-mini"
161
+ if TTS_MODEL_REPO == "fishaudio/openaudio-s1-mini"
162
+ else f"https://huggingface.co/{TTS_MODEL_REPO}"
163
+ )
164
+ raise TTSModelAccessError(
165
+ f"The TTS model '{TTS_MODEL_REPO}' is gated or not accessible with the current "
166
+ f"Hugging Face token. Request access at {access_url}, then log in locally or set "
167
+ "HF_TOKEN to a token with read access. You can also set TTS_MODEL_REPO to another "
168
+ "compatible Fish Audio/OpenAudio checkpoint you can access."
169
+ ) from e
170
+ raise
171
 
172
  llama_queue = launch_thread_safe_queue(
173
  checkpoint_path=checkpoint_dir,