Spaces:
Running on Zero
pr/2
#2
by nvipin63 - opened
- README.md +2 -1
- app.py +116 -11
- tts/engine.py +22 -1
README.md
CHANGED
|
@@ -85,7 +85,8 @@ huggingface-cli upload <user>/podify . --repo-type=space
|
|
| 85 |
MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
|
| 86 |
A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
|
| 87 |
|
|
|
|
| 88 |
#backyard-ai
|
| 89 |
-
Blog:
|
| 90 |
Social Media Post:
|
| 91 |
Demo:
|
|
|
|
| 85 |
MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
|
| 86 |
A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
|
| 87 |
|
| 88 |
+
|
| 89 |
#backyard-ai
|
| 90 |
+
Blog: [Article](https://huggingface.co/blog/build-small-hackathon/podify)
|
| 91 |
Social Media Post:
|
| 92 |
Demo:
|
app.py
CHANGED
|
@@ -88,6 +88,15 @@ LIBRARY_VOICE_NAMES = list(_VOICE_NAME_TO_ID.keys())
|
|
| 88 |
_VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
|
| 89 |
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def _voice_config_for(name):
|
| 92 |
"""Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
|
| 93 |
vid = _VOICE_NAME_TO_ID.get(name)
|
|
@@ -373,7 +382,7 @@ def run_research(topic, style, duration, num_speakers, *voice_names, progress=gr
|
|
| 373 |
topic = (topic or "").strip()
|
| 374 |
if not topic:
|
| 375 |
raise gr.Error("Please enter a topic for the podcast.")
|
| 376 |
-
n =
|
| 377 |
# Use the picked voice names as the script's speaker names when they're distinct, so the
|
| 378 |
# dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
|
| 379 |
# duplicate pick simply falls back to generic Host/Guest labels.)
|
|
@@ -427,7 +436,10 @@ def run_tts(lines, speakers, topic, bed, *voice_names, progress=gr.Progress()):
|
|
| 427 |
tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
|
| 428 |
|
| 429 |
progress(0.05, desc="Analyzing script & pacing…")
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
# Mix the selected background-music bed under the voices (no-op for "No music").
|
| 433 |
progress(0.95, desc="Mixing the sound bed…")
|
|
@@ -567,6 +579,14 @@ footer { display:none !important; }
|
|
| 567 |
/* ---------- cards / panels ---------- */
|
| 568 |
.pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 569 |
border-radius:18px !important; padding:20px 22px !important; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
.pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
|
| 571 |
display:flex;align-items:center;gap:7px;margin-bottom:8px; }
|
| 572 |
.pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
|
|
@@ -576,8 +596,26 @@ footer { display:none !important; }
|
|
| 576 |
.pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
|
| 577 |
|
| 578 |
/* topic textbox */
|
| 579 |
-
#pf-topic
|
| 580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
#pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 582 |
border-radius:18px !important; padding:20px 22px 22px !important; }
|
| 583 |
|
|
@@ -594,10 +632,69 @@ footer { display:none !important; }
|
|
| 594 |
.pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 595 |
border-radius:15px !important; padding:14px 16px !important; }
|
| 596 |
.pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
|
| 597 |
-
text-transform:uppercase !important; color:
|
| 598 |
-
|
| 599 |
-
.pf-config-card .
|
| 600 |
-
.pf-config-card .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
/* dropdowns open downward — keep the popup above the cards that follow it, otherwise
|
| 602 |
the next card paints over all but the first option. Raise the row's stacking context
|
| 603 |
(no overflow change, so nothing gets clipped). */
|
|
@@ -911,7 +1008,10 @@ def build_ui():
|
|
| 911 |
value=2, label="LENGTH", filterable=False,
|
| 912 |
)
|
| 913 |
with gr.Column(elem_classes=["pf-config-card"]):
|
| 914 |
-
num_speakers = gr.Slider(
|
|
|
|
|
|
|
|
|
|
| 915 |
with gr.Column(elem_classes=["pf-config-card"]):
|
| 916 |
bed_step1 = gr.Dropdown(
|
| 917 |
[b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
|
|
@@ -1085,14 +1185,19 @@ def build_ui():
|
|
| 1085 |
script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
|
| 1086 |
|
| 1087 |
# show one voice picker per selected speaker
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
num_speakers.change(
|
| 1089 |
-
|
| 1090 |
inputs=[num_speakers], outputs=voice_pickers,
|
| 1091 |
)
|
| 1092 |
|
| 1093 |
# keep the header cast chip in sync with the picked voices (all pages)
|
| 1094 |
def _refresh_headers(n, *names):
|
| 1095 |
-
|
|
|
|
| 1096 |
return (
|
| 1097 |
gr.update(value=_header("Create",
|
| 1098 |
"Type a topic — Podify writes & voices it", sel)),
|
|
|
|
| 88 |
_VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
|
| 89 |
|
| 90 |
|
| 91 |
+
def _speaker_count(value, default: int = 2) -> int:
|
| 92 |
+
"""Normalize transient Gradio slider values such as None or 2.0."""
|
| 93 |
+
try:
|
| 94 |
+
n = int(float(value))
|
| 95 |
+
except (TypeError, ValueError):
|
| 96 |
+
n = default
|
| 97 |
+
return max(1, min(MAX_SPEAKERS, n))
|
| 98 |
+
|
| 99 |
+
|
| 100 |
def _voice_config_for(name):
|
| 101 |
"""Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
|
| 102 |
vid = _VOICE_NAME_TO_ID.get(name)
|
|
|
|
| 382 |
topic = (topic or "").strip()
|
| 383 |
if not topic:
|
| 384 |
raise gr.Error("Please enter a topic for the podcast.")
|
| 385 |
+
n = _speaker_count(num_speakers)
|
| 386 |
# Use the picked voice names as the script's speaker names when they're distinct, so the
|
| 387 |
# dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
|
| 388 |
# duplicate pick simply falls back to generic Host/Guest labels.)
|
|
|
|
| 436 |
tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
|
| 437 |
|
| 438 |
progress(0.05, desc="Analyzing script & pacing…")
|
| 439 |
+
try:
|
| 440 |
+
sr, audio = tts_engine.generate_podcast(tts_lines, voice_map, progress=progress)
|
| 441 |
+
except tts_engine.TTSModelAccessError as e:
|
| 442 |
+
raise gr.Error(str(e)) from e
|
| 443 |
|
| 444 |
# Mix the selected background-music bed under the voices (no-op for "No music").
|
| 445 |
progress(0.95, desc="Mixing the sound bed…")
|
|
|
|
| 579 |
/* ---------- cards / panels ---------- */
|
| 580 |
.pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 581 |
border-radius:18px !important; padding:20px 22px !important; }
|
| 582 |
+
.pf-card .form {
|
| 583 |
+
background:transparent !important; border:none !important; box-shadow:none !important;
|
| 584 |
+
padding:0 !important;
|
| 585 |
+
}
|
| 586 |
+
.pf-card .form > .pf-config-card {
|
| 587 |
+
background:rgba(255,255,255,.035) !important;
|
| 588 |
+
border-color:rgba(255,255,255,.08) !important;
|
| 589 |
+
}
|
| 590 |
.pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
|
| 591 |
display:flex;align-items:center;gap:7px;margin-bottom:8px; }
|
| 592 |
.pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
|
|
|
|
| 596 |
.pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
|
| 597 |
|
| 598 |
/* topic textbox */
|
| 599 |
+
#pf-topic-card, .pf-config-card {
|
| 600 |
+
--block-background-fill:transparent;
|
| 601 |
+
--block-border-color:transparent;
|
| 602 |
+
--input-background-fill:rgba(255,255,255,.035);
|
| 603 |
+
--input-border-color:rgba(255,255,255,.10);
|
| 604 |
+
--input-text-color:var(--pf-text);
|
| 605 |
+
--body-text-color:var(--pf-text);
|
| 606 |
+
--block-label-text-color:#b9bbcf;
|
| 607 |
+
}
|
| 608 |
+
#pf-topic, #pf-topic .block, #pf-topic .form, #pf-topic .wrap,
|
| 609 |
+
#pf-topic .container, #pf-topic .input-container,
|
| 610 |
+
#pf-topic-card .block, #pf-topic-card .form, #pf-topic-card .wrap,
|
| 611 |
+
#pf-topic-card .container, #pf-topic-card .input-container {
|
| 612 |
+
background:transparent !important; border:none !important; box-shadow:none !important;
|
| 613 |
+
}
|
| 614 |
+
#pf-topic textarea { background:rgba(255,255,255,.035) !important;
|
| 615 |
+
border:1px solid rgba(255,255,255,.10) !important; border-radius:14px !important;
|
| 616 |
+
font-size:21px !important; color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
|
| 617 |
+
line-height:1.5 !important; box-shadow:none !important; padding:18px 20px !important; }
|
| 618 |
+
#pf-topic textarea::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
|
| 619 |
#pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 620 |
border-radius:18px !important; padding:20px 22px 22px !important; }
|
| 621 |
|
|
|
|
| 632 |
.pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
|
| 633 |
border-radius:15px !important; padding:14px 16px !important; }
|
| 634 |
.pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
|
| 635 |
+
text-transform:uppercase !important; color:#aeb1c6 !important;
|
| 636 |
+
-webkit-text-fill-color:#aeb1c6 !important; font-weight:700 !important; }
|
| 637 |
+
.pf-config-card .block, .pf-config-card .form, .pf-config-card .gr-box,
|
| 638 |
+
.pf-config-card .container, .pf-config-card .wrap {
|
| 639 |
+
background:transparent !important; border:none !important; box-shadow:none !important;
|
| 640 |
+
}
|
| 641 |
+
.pf-config-card .secondary-wrap, .pf-config-card .input-container,
|
| 642 |
+
.pf-config-card input, .pf-config-card select,
|
| 643 |
+
.pf-config-card [data-testid="dropdown"], .pf-config-card [role="button"] {
|
| 644 |
+
background:rgba(255,255,255,.045) !important;
|
| 645 |
+
border:1px solid rgba(255,255,255,.10) !important; border-radius:10px !important;
|
| 646 |
+
color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
|
| 647 |
+
box-shadow:none !important;
|
| 648 |
+
}
|
| 649 |
+
.pf-config-card input, .pf-config-card select { min-height:40px !important; padding:7px 10px !important; }
|
| 650 |
+
.pf-config-card input::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
|
| 651 |
+
.pf-config-card .secondary-wrap *, .pf-config-card [data-testid="dropdown"] *,
|
| 652 |
+
.pf-config-card [role="button"] * {
|
| 653 |
+
color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
|
| 654 |
+
}
|
| 655 |
+
.pf-config-card svg { color:#c9b8ff !important; fill:currentColor !important; }
|
| 656 |
+
.pf-config-card option { background:var(--pf-panel-2) !important; color:var(--pf-text) !important; }
|
| 657 |
+
.pf-config-card .slider input, .pf-config-card input[type="range"] {
|
| 658 |
+
background:transparent !important; border:none !important; -webkit-text-fill-color:initial !important;
|
| 659 |
+
}
|
| 660 |
+
.pf-config-card .input-container input[type="number"],
|
| 661 |
+
.pf-config-card input[data-testid="number-input"] {
|
| 662 |
+
width:74px !important; min-width:74px !important; max-width:74px !important;
|
| 663 |
+
height:40px !important; min-height:40px !important; padding:0 10px !important;
|
| 664 |
+
text-align:center !important; background:rgba(255,255,255,.045) !important;
|
| 665 |
+
border:1px solid rgba(255,255,255,.10) !important; border-radius:9px !important;
|
| 666 |
+
color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
|
| 667 |
+
}
|
| 668 |
+
#pf-voices input[data-testid="number-input"] {
|
| 669 |
+
width:54px !important; min-width:54px !important; max-width:54px !important;
|
| 670 |
+
height:34px !important; min-height:34px !important; line-height:34px !important;
|
| 671 |
+
padding:0 !important; box-sizing:border-box !important; text-align:center !important;
|
| 672 |
+
font-size:15px !important; font-weight:800 !important;
|
| 673 |
+
background:rgba(255,255,255,.055) !important;
|
| 674 |
+
border:1px solid rgba(255,255,255,.13) !important; border-radius:9px !important;
|
| 675 |
+
}
|
| 676 |
+
#pf-voices input[data-testid="number-input"]::-webkit-outer-spin-button,
|
| 677 |
+
#pf-voices input[data-testid="number-input"]::-webkit-inner-spin-button {
|
| 678 |
+
-webkit-appearance:none !important; margin:0 !important;
|
| 679 |
+
}
|
| 680 |
+
.pf-config-card .slider, .pf-config-card .slider * {
|
| 681 |
+
color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
|
| 682 |
+
}
|
| 683 |
+
.gradio-container .options, .gradio-container .options ul,
|
| 684 |
+
.gradio-container [role="listbox"] {
|
| 685 |
+
background:var(--pf-panel-2) !important; border:1px solid rgba(124,92,255,.34) !important;
|
| 686 |
+
color:var(--pf-text) !important; box-shadow:0 18px 46px rgba(0,0,0,.55) !important;
|
| 687 |
+
}
|
| 688 |
+
.gradio-container .options li, .gradio-container .options .item,
|
| 689 |
+
.gradio-container [role="option"] {
|
| 690 |
+
background:transparent !important; color:var(--pf-text) !important;
|
| 691 |
+
-webkit-text-fill-color:var(--pf-text) !important;
|
| 692 |
+
}
|
| 693 |
+
.gradio-container .options li:hover, .gradio-container .options .item:hover,
|
| 694 |
+
.gradio-container [role="option"]:hover,
|
| 695 |
+
.gradio-container [role="option"][aria-selected="true"] {
|
| 696 |
+
background:rgba(124,92,255,.18) !important; color:#fff !important; -webkit-text-fill-color:#fff !important;
|
| 697 |
+
}
|
| 698 |
/* dropdowns open downward — keep the popup above the cards that follow it, otherwise
|
| 699 |
the next card paints over all but the first option. Raise the row's stacking context
|
| 700 |
(no overflow change, so nothing gets clipped). */
|
|
|
|
| 1008 |
value=2, label="LENGTH", filterable=False,
|
| 1009 |
)
|
| 1010 |
with gr.Column(elem_classes=["pf-config-card"]):
|
| 1011 |
+
num_speakers = gr.Slider(
|
| 1012 |
+
1, MAX_SPEAKERS, value=2, step=1, label="VOICES",
|
| 1013 |
+
elem_id="pf-voices", show_reset_button=False,
|
| 1014 |
+
)
|
| 1015 |
with gr.Column(elem_classes=["pf-config-card"]):
|
| 1016 |
bed_step1 = gr.Dropdown(
|
| 1017 |
[b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
|
|
|
|
| 1185 |
script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
|
| 1186 |
|
| 1187 |
# show one voice picker per selected speaker
|
| 1188 |
+
def _voice_picker_visibility(n):
|
| 1189 |
+
count = _speaker_count(n)
|
| 1190 |
+
return [gr.update(visible=i < count) for i in range(MAX_SPEAKERS)]
|
| 1191 |
+
|
| 1192 |
num_speakers.change(
|
| 1193 |
+
_voice_picker_visibility,
|
| 1194 |
inputs=[num_speakers], outputs=voice_pickers,
|
| 1195 |
)
|
| 1196 |
|
| 1197 |
# keep the header cast chip in sync with the picked voices (all pages)
|
| 1198 |
def _refresh_headers(n, *names):
|
| 1199 |
+
count = _speaker_count(n)
|
| 1200 |
+
sel = [names[i] for i in range(count) if i < len(names) and names[i]]
|
| 1201 |
return (
|
| 1202 |
gr.update(value=_header("Create",
|
| 1203 |
"Type a topic — Podify writes & voices it", sel)),
|
tts/engine.py
CHANGED
|
@@ -43,6 +43,10 @@ _ENGINE = None # cached TTSInferenceEngine
|
|
| 43 |
_SAMPLE_RATE = 44100
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
@dataclass
|
| 47 |
class VoiceConfig:
|
| 48 |
"""Resolved voice for one speaker: a reference clip+text, or model default."""
|
|
@@ -146,7 +150,24 @@ def _load_engine():
|
|
| 146 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 147 |
precision = torch.half if device == "cuda" else torch.float32
|
| 148 |
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
llama_queue = launch_thread_safe_queue(
|
| 152 |
checkpoint_path=checkpoint_dir,
|
|
|
|
| 43 |
_SAMPLE_RATE = 44100
|
| 44 |
|
| 45 |
|
| 46 |
+
class TTSModelAccessError(RuntimeError):
|
| 47 |
+
"""Raised when the configured TTS model cannot be downloaded from HF Hub."""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
@dataclass
|
| 51 |
class VoiceConfig:
|
| 52 |
"""Resolved voice for one speaker: a reference clip+text, or model default."""
|
|
|
|
| 150 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 151 |
precision = torch.half if device == "cuda" else torch.float32
|
| 152 |
|
| 153 |
+
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
|
| 154 |
+
try:
|
| 155 |
+
checkpoint_dir = snapshot_download(repo_id=TTS_MODEL_REPO, token=token)
|
| 156 |
+
except Exception as e:
|
| 157 |
+
msg = str(e)
|
| 158 |
+
if type(e).__name__ == "GatedRepoError" or "Cannot access gated repo" in msg or "403" in msg:
|
| 159 |
+
access_url = (
|
| 160 |
+
"https://huggingface.co/fishaudio/s1-mini"
|
| 161 |
+
if TTS_MODEL_REPO == "fishaudio/openaudio-s1-mini"
|
| 162 |
+
else f"https://huggingface.co/{TTS_MODEL_REPO}"
|
| 163 |
+
)
|
| 164 |
+
raise TTSModelAccessError(
|
| 165 |
+
f"The TTS model '{TTS_MODEL_REPO}' is gated or not accessible with the current "
|
| 166 |
+
f"Hugging Face token. Request access at {access_url}, then log in locally or set "
|
| 167 |
+
"HF_TOKEN to a token with read access. You can also set TTS_MODEL_REPO to another "
|
| 168 |
+
"compatible Fish Audio/OpenAudio checkpoint you can access."
|
| 169 |
+
) from e
|
| 170 |
+
raise
|
| 171 |
|
| 172 |
llama_queue = launch_thread_safe_queue(
|
| 173 |
checkpoint_path=checkpoint_dir,
|