Spaces:

build-small-hackathon
/

podify

Running on Zero

App Files Files Community

pr/2

by nvipin63 - opened 12 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+140

-13

Files changed (3) hide show

README.md +2 -1
app.py +116 -11
tts/engine.py +22 -1

README.md CHANGED Viewed

@@ -85,7 +85,8 @@ huggingface-cli upload <user>/podify . --repo-type=space
   MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
   A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
 #backyard-ai
-Blog:
 Social Media Post:
 Demo:

   MacLeod — 100% public domain (CC0). Rebuild with `scripts/build_music_loops.py`.
   A procedural numpy fallback in `tts/music.py` is used if the loops are absent.
 #backyard-ai
+Blog: [Article](https://huggingface.co/blog/build-small-hackathon/podify)
 Social Media Post:
 Demo:

app.py CHANGED Viewed

@@ -88,6 +88,15 @@ LIBRARY_VOICE_NAMES = list(_VOICE_NAME_TO_ID.keys())
 _VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
 def _voice_config_for(name):
     """Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
     vid = _VOICE_NAME_TO_ID.get(name)
@@ -373,7 +382,7 @@ def run_research(topic, style, duration, num_speakers, *voice_names, progress=gr
     topic = (topic or "").strip()
     if not topic:
         raise gr.Error("Please enter a topic for the podcast.")
-    n = int(num_speakers)
     # Use the picked voice names as the script's speaker names when they're distinct, so the
     # dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
     # duplicate pick simply falls back to generic Host/Guest labels.)
@@ -427,7 +436,10 @@ def run_tts(lines, speakers, topic, bed, *voice_names, progress=gr.Progress()):
     tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
     progress(0.05, desc="Analyzing script & pacing…")
-    sr, audio = tts_engine.generate_podcast(tts_lines, voice_map, progress=progress)
     # Mix the selected background-music bed under the voices (no-op for "No music").
     progress(0.95, desc="Mixing the sound bed…")
@@ -567,6 +579,14 @@ footer { display:none !important; }
 /* ---------- cards / panels ---------- */
 .pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:18px !important; padding:20px 22px !important; }
 .pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
   display:flex;align-items:center;gap:7px;margin-bottom:8px; }
 .pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
@@ -576,8 +596,26 @@ footer { display:none !important; }
 .pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
 /* topic textbox */
-#pf-topic textarea { background:transparent !important; border:none !important; font-size:21px !important;
-  color:var(--pf-text) !important; line-height:1.5 !important; box-shadow:none !important; }
 #pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:18px !important; padding:20px 22px 22px !important; }
@@ -594,10 +632,69 @@ footer { display:none !important; }
 .pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:15px !important; padding:14px 16px !important; }
 .pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
-  text-transform:uppercase !important; color:var(--pf-faint) !important; font-weight:700 !important; }
-.pf-config-card .wrap, .pf-config-card input, .pf-config-card select,
-.pf-config-card .secondary-wrap { background:transparent !important; border:none !important; color:#fff !important; }
-.pf-config-card .gr-box, .pf-config-card .container { border:none !important; background:transparent !important; }
 /* dropdowns open downward — keep the popup above the cards that follow it, otherwise
    the next card paints over all but the first option. Raise the row's stacking context
    (no overflow change, so nothing gets clipped). */
@@ -911,7 +1008,10 @@ def build_ui():
                                     value=2, label="LENGTH", filterable=False,
                                 )
                             with gr.Column(elem_classes=["pf-config-card"]):
-                                num_speakers = gr.Slider(1, MAX_SPEAKERS, value=2, step=1, label="VOICES")
                             with gr.Column(elem_classes=["pf-config-card"]):
                                 bed_step1 = gr.Dropdown(
                                     [b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
@@ -1085,14 +1185,19 @@ def build_ui():
         script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
         # show one voice picker per selected speaker
         num_speakers.change(
-            lambda n: [gr.update(visible=i < int(n)) for i in range(MAX_SPEAKERS)],
             inputs=[num_speakers], outputs=voice_pickers,
         )
         # keep the header cast chip in sync with the picked voices (all pages)
         def _refresh_headers(n, *names):
-            sel = [names[i] for i in range(int(n)) if i < len(names) and names[i]]
             return (
                 gr.update(value=_header("Create",
                                         "Type a topic — Podify writes &amp; voices it", sel)),

 _VOICE_DEFAULTS = ["Nova", "Atlas", "Echo", "Sage"]
+def _speaker_count(value, default: int = 2) -> int:
+    """Normalize transient Gradio slider values such as None or 2.0."""
+    try:
+        n = int(float(value))
+    except (TypeError, ValueError):
+        n = default
+    return max(1, min(MAX_SPEAKERS, n))
 def _voice_config_for(name):
     """Resolve a library voice name (e.g. 'Nova') to its cloning reference clip + text."""
     vid = _VOICE_NAME_TO_ID.get(name)
     topic = (topic or "").strip()
     if not topic:
         raise gr.Error("Please enter a topic for the podcast.")
+    n = _speaker_count(num_speakers)
     # Use the picked voice names as the script's speaker names when they're distinct, so the
     # dialogue reads "Nova: …" / "Atlas: …". (Audio mapping is order-based regardless, so a
     # duplicate pick simply falls back to generic Host/Guest labels.)
     tts_lines = [(spk, _strip_cues(txt)) for spk, txt in lines]
     progress(0.05, desc="Analyzing script & pacing…")
+    try:
+        sr, audio = tts_engine.generate_podcast(tts_lines, voice_map, progress=progress)
+    except tts_engine.TTSModelAccessError as e:
+        raise gr.Error(str(e)) from e
     # Mix the selected background-music bed under the voices (no-op for "No music").
     progress(0.95, desc="Mixing the sound bed…")
 /* ---------- cards / panels ---------- */
 .pf-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:18px !important; padding:20px 22px !important; }
+.pf-card .form {
+  background:transparent !important; border:none !important; box-shadow:none !important;
+  padding:0 !important;
+}
+.pf-card .form > .pf-config-card {
+  background:rgba(255,255,255,.035) !important;
+  border-color:rgba(255,255,255,.08) !important;
+}
 .pf-eyebrow { font-size:10.5px;letter-spacing:2px;color:var(--pf-faint);font-weight:700;
   display:flex;align-items:center;gap:7px;margin-bottom:8px; }
 .pf-step-h { display:flex;align-items:center;gap:11px;margin:4px 0 16px; }
 .pf-step-h .h { font-size:12.5px;color:var(--pf-mut);font-weight:500; }
 /* topic textbox */
+#pf-topic-card, .pf-config-card {
+  --block-background-fill:transparent;
+  --block-border-color:transparent;
+  --input-background-fill:rgba(255,255,255,.035);
+  --input-border-color:rgba(255,255,255,.10);
+  --input-text-color:var(--pf-text);
+  --body-text-color:var(--pf-text);
+  --block-label-text-color:#b9bbcf;
+}
+#pf-topic, #pf-topic .block, #pf-topic .form, #pf-topic .wrap,
+#pf-topic .container, #pf-topic .input-container,
+#pf-topic-card .block, #pf-topic-card .form, #pf-topic-card .wrap,
+#pf-topic-card .container, #pf-topic-card .input-container {
+  background:transparent !important; border:none !important; box-shadow:none !important;
+}
+#pf-topic textarea { background:rgba(255,255,255,.035) !important;
+  border:1px solid rgba(255,255,255,.10) !important; border-radius:14px !important;
+  font-size:21px !important; color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
+  line-height:1.5 !important; box-shadow:none !important; padding:18px 20px !important; }
+#pf-topic textarea::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
 #pf-topic-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:18px !important; padding:20px 22px 22px !important; }
 .pf-config-card { background:var(--pf-panel) !important; border:1px solid var(--pf-line) !important;
   border-radius:15px !important; padding:14px 16px !important; }
 .pf-config-card label span { font-size:10px !important; letter-spacing:1.5px !important;
+  text-transform:uppercase !important; color:#aeb1c6 !important;
+  -webkit-text-fill-color:#aeb1c6 !important; font-weight:700 !important; }
+.pf-config-card .block, .pf-config-card .form, .pf-config-card .gr-box,
+.pf-config-card .container, .pf-config-card .wrap {
+  background:transparent !important; border:none !important; box-shadow:none !important;
+}
+.pf-config-card .secondary-wrap, .pf-config-card .input-container,
+.pf-config-card input, .pf-config-card select,
+.pf-config-card [data-testid="dropdown"], .pf-config-card [role="button"] {
+  background:rgba(255,255,255,.045) !important;
+  border:1px solid rgba(255,255,255,.10) !important; border-radius:10px !important;
+  color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
+  box-shadow:none !important;
+}
+.pf-config-card input, .pf-config-card select { min-height:40px !important; padding:7px 10px !important; }
+.pf-config-card input::placeholder { color:#c3c5d5 !important; -webkit-text-fill-color:#c3c5d5 !important; opacity:1 !important; }
+.pf-config-card .secondary-wrap *, .pf-config-card [data-testid="dropdown"] *,
+.pf-config-card [role="button"] * {
+  color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
+}
+.pf-config-card svg { color:#c9b8ff !important; fill:currentColor !important; }
+.pf-config-card option { background:var(--pf-panel-2) !important; color:var(--pf-text) !important; }
+.pf-config-card .slider input, .pf-config-card input[type="range"] {
+  background:transparent !important; border:none !important; -webkit-text-fill-color:initial !important;
+}
+.pf-config-card .input-container input[type="number"],
+.pf-config-card input[data-testid="number-input"] {
+  width:74px !important; min-width:74px !important; max-width:74px !important;
+  height:40px !important; min-height:40px !important; padding:0 10px !important;
+  text-align:center !important; background:rgba(255,255,255,.045) !important;
+  border:1px solid rgba(255,255,255,.10) !important; border-radius:9px !important;
+  color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
+}
+#pf-voices input[data-testid="number-input"] {
+  width:54px !important; min-width:54px !important; max-width:54px !important;
+  height:34px !important; min-height:34px !important; line-height:34px !important;
+  padding:0 !important; box-sizing:border-box !important; text-align:center !important;
+  font-size:15px !important; font-weight:800 !important;
+  background:rgba(255,255,255,.055) !important;
+  border:1px solid rgba(255,255,255,.13) !important; border-radius:9px !important;
+}
+#pf-voices input[data-testid="number-input"]::-webkit-outer-spin-button,
+#pf-voices input[data-testid="number-input"]::-webkit-inner-spin-button {
+  -webkit-appearance:none !important; margin:0 !important;
+}
+.pf-config-card .slider, .pf-config-card .slider * {
+  color:var(--pf-text) !important; -webkit-text-fill-color:var(--pf-text) !important;
+}
+.gradio-container .options, .gradio-container .options ul,
+.gradio-container [role="listbox"] {
+  background:var(--pf-panel-2) !important; border:1px solid rgba(124,92,255,.34) !important;
+  color:var(--pf-text) !important; box-shadow:0 18px 46px rgba(0,0,0,.55) !important;
+}
+.gradio-container .options li, .gradio-container .options .item,
+.gradio-container [role="option"] {
+  background:transparent !important; color:var(--pf-text) !important;
+  -webkit-text-fill-color:var(--pf-text) !important;
+}
+.gradio-container .options li:hover, .gradio-container .options .item:hover,
+.gradio-container [role="option"]:hover,
+.gradio-container [role="option"][aria-selected="true"] {
+  background:rgba(124,92,255,.18) !important; color:#fff !important; -webkit-text-fill-color:#fff !important;
+}
 /* dropdowns open downward — keep the popup above the cards that follow it, otherwise
    the next card paints over all but the first option. Raise the row's stacking context
    (no overflow change, so nothing gets clipped). */
                                     value=2, label="LENGTH", filterable=False,
                                 )
                             with gr.Column(elem_classes=["pf-config-card"]):
+                                num_speakers = gr.Slider(
+                                    1, MAX_SPEAKERS, value=2, step=1, label="VOICES",
+                                    elem_id="pf-voices", show_reset_button=False,
+                                )
                             with gr.Column(elem_classes=["pf-config-card"]):
                                 bed_step1 = gr.Dropdown(
                                     [b[0] for b in SOUND_BEDS], value="Ambient Drift", label="SOUND BED",
         script_box2.change(script_to_bubbles, inputs=[script_box2], outputs=[review_bubbles])
         # show one voice picker per selected speaker
+        def _voice_picker_visibility(n):
+            count = _speaker_count(n)
+            return [gr.update(visible=i < count) for i in range(MAX_SPEAKERS)]
         num_speakers.change(
+            _voice_picker_visibility,
             inputs=[num_speakers], outputs=voice_pickers,
         )
         # keep the header cast chip in sync with the picked voices (all pages)
         def _refresh_headers(n, *names):
+            count = _speaker_count(n)
+            sel = [names[i] for i in range(count) if i < len(names) and names[i]]
             return (
                 gr.update(value=_header("Create",
                                         "Type a topic — Podify writes &amp; voices it", sel)),

tts/engine.py CHANGED Viewed

@@ -43,6 +43,10 @@ _ENGINE = None  # cached TTSInferenceEngine
 _SAMPLE_RATE = 44100
 @dataclass
 class VoiceConfig:
     """Resolved voice for one speaker: a reference clip+text, or model default."""
@@ -146,7 +150,24 @@ def _load_engine():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     precision = torch.half if device == "cuda" else torch.float32
-    checkpoint_dir = snapshot_download(repo_id=TTS_MODEL_REPO)
     llama_queue = launch_thread_safe_queue(
         checkpoint_path=checkpoint_dir,

 _SAMPLE_RATE = 44100
+class TTSModelAccessError(RuntimeError):
+    """Raised when the configured TTS model cannot be downloaded from HF Hub.
+
+    Raised for gated-repo / HTTP 403 style access failures during the model
+    snapshot download. The message explains how to request access or supply a
+    token; app.py re-raises it as a ``gr.Error`` carrying the same text.
+    """
 @dataclass
 class VoiceConfig:
     """Resolved voice for one speaker: a reference clip+text, or model default."""
     device = "cuda" if torch.cuda.is_available() else "cpu"
     precision = torch.half if device == "cuda" else torch.float32
+    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+    try:
+        checkpoint_dir = snapshot_download(repo_id=TTS_MODEL_REPO, token=token)
+    except Exception as e:
+        msg = str(e)
+        if type(e).__name__ == "GatedRepoError" or "Cannot access gated repo" in msg or "403" in msg:
+            access_url = (
+                "https://huggingface.co/fishaudio/s1-mini"
+                if TTS_MODEL_REPO == "fishaudio/openaudio-s1-mini"
+                else f"https://huggingface.co/{TTS_MODEL_REPO}"
+            )
+            raise TTSModelAccessError(
+                f"The TTS model '{TTS_MODEL_REPO}' is gated or not accessible with the current "
+                f"Hugging Face token. Request access at {access_url}, then log in locally or set "
+                "HF_TOKEN to a token with read access. You can also set TTS_MODEL_REPO to another "
+                "compatible Fish Audio/OpenAudio checkpoint you can access."
+            ) from e
+        raise
     llama_queue = launch_thread_safe_queue(
         checkpoint_path=checkpoint_dir,