DramaboxTTS / app.py
Daankular's picture
Remove temporary ZeroGPU probe
d7ce2aa verified
Raw
History Blame Contribute Delete
15.6 kB
#!/usr/bin/env python3
"""Higgs Audio v3 TTS voice gallery for Hugging Face ZeroGPU."""
import json
import logging
import os
import sys
import tempfile
import gradio as gr
import requests
import soundfile as sf
import spaces
_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_DIR, "src"))
import higgs_backend # noqa: E402
import asr_backend # noqa: E402
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# ── Voices ─────────────────────────────────────────────────────────────────────
# The voice catalogue is read once at import time; the filter choices below are
# derived from it and never refreshed while the process runs.
with open(os.path.join(_DIR, "voices.json"), encoding="utf-8") as _f:
    VOICES = json.load(_f)

# Number of gallery cards shown per page.
PER_PAGE = 20

# Choices for the filter widgets; "All" disables the corresponding filter.
GENDERS = ["All", "female", "male"]
LANGUAGES = ["All", *sorted({v["language"] for v in VOICES if v.get("language")})]

logging.info(f"Loaded {len(VOICES):,} voices")
def _filter(search, lang, gender, accent):
    """Return the voices from ``VOICES`` that satisfy every active filter.

    Args:
        search: Free-text query matched case-insensitively as a substring of
            the voice name or description. Empty or ``None`` disables it.
        lang: Exact language to keep, or ``"All"`` to disable the filter.
        gender: Exact gender to keep, or ``"All"`` to disable the filter.
        accent: Exact accent to keep, or ``"All"`` to disable the filter.

    Returns:
        A new list of voice dicts, in catalogue order.
    """
    needle = (search or "").lower()

    def _matches(voice):
        if lang != "All" and voice.get("language") != lang:
            return False
        if gender != "All" and voice.get("gender") != gender:
            return False
        if accent != "All" and voice.get("accent") != accent:
            return False
        if not needle:
            return True
        # `or ""` (rather than a .get default) also covers keys that are
        # present but null in the JSON, for the name as well as the description.
        name = (voice.get("name") or "").lower()
        description = (voice.get("description") or "").lower()
        return needle in name or needle in description

    return [voice for voice in VOICES if _matches(voice)]
def _accents_for(lang):
    """List the accent choices available for *lang*.

    ``"All"`` as the language considers the whole catalogue. The result always
    starts with ``"All"``, followed by the distinct non-empty accents in sorted
    order.
    """
    accents = set()
    for voice in VOICES:
        if lang != "All" and voice.get("language") != lang:
            continue
        accent = voice.get("accent")
        if accent:
            accents.add(accent)
    return ["All", *sorted(accents)]
# ── Model ──────────────────────────────────────────────────────────────────────
# Both backends are initialised here, as top-level statements, so this happens
# once when the module is imported and before the UI below is built.
# NOTE(review): presumably load() pulls the TTS and ASR model weights into
# memory — confirm in src/higgs_backend.py and src/asr_backend.py.
higgs_backend.load()
asr_backend.load()
@spaces.GPU(duration=10, size="large")
def on_generate(prompt, preview_url, ref_text, temperature, top_p, top_k,
                max_new_tok, seed,
                progress=gr.Progress()):
    """Synthesize *prompt* and return the path of the generated WAV file.

    Args:
        prompt: Text to speak; must contain non-whitespace characters.
        preview_url: URL of the selected voice's preview clip. When set, the
            clip is downloaded and passed to the backend as the voice
            reference; when falsy, no reference is used.
        ref_text: Transcript of the reference clip, forwarded unchanged.
        temperature, top_p, top_k, max_new_tok: Sampling settings, coerced to
            ``float``/``int`` before being forwarded.
        seed: Integer seed; ``None`` (a cleared number field) is treated as
            ``-1``, the value the UI labels as "random".
        progress: Gradio progress tracker.

    Returns:
        Path to a WAV file in the system temp directory.

    Raises:
        gr.Error: If the prompt is empty or the preview clip cannot be
            downloaded.
    """
    text = (prompt or "").strip()
    if not text:
        raise gr.Error("Prompt is empty.")

    # gr.Number yields None when the field is cleared; int(None) would raise.
    seed_value = -1 if seed is None else int(seed)

    ref_path = None
    try:
        if preview_url:
            try:
                resp = requests.get(preview_url, timeout=30)
                resp.raise_for_status()
            except requests.RequestException as exc:
                raise gr.Error(f"Could not download the voice preview: {exc}") from exc
            # Record the name before writing so the `finally` below removes the
            # file even if the write itself fails.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as ref_file:
                ref_path = ref_file.name
                ref_file.write(resp.content)

        progress(0.5, desc="Generating with Higgs Audio v3…")
        waveform, sr = higgs_backend.generate(
            text, voice_ref=ref_path, reference_text=ref_text,
            temperature=float(temperature), top_p=float(top_p),
            top_k=int(top_k), max_new_tokens=int(max_new_tok), seed=seed_value,
        )

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
        # returns a name and is deprecated because another process can claim
        # that name first. The file is kept (delete=False) for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="higgs_",
                                         delete=False) as out_file:
            out = out_file.name
        try:
            sf.write(out, waveform, sr)
        except Exception:
            # Do not leave an empty placeholder behind on a failed write.
            os.unlink(out)
            raise
        return out
    finally:
        if ref_path and os.path.exists(ref_path):
            os.unlink(ref_path)
# ── CSS ─────────────────────────────────────────────────────────────────────────
# Stylesheet for the voice gallery; handed to Gradio when the app is launched.
CSS = """
/* card grid */
.card-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; }
@media (max-width: 1200px) { .card-grid { grid-template-columns: repeat(3, 1fr); } }
@media (max-width: 800px) { .card-grid { grid-template-columns: repeat(2, 1fr); } }
/* individual card — scoped inside the Gradio column */
.voice-card { background: #16161e !important; border: 1px solid #2a2a3a !important;
border-radius: 10px !important; padding: 14px !important; height: 100% !important; }
.voice-card:hover { border-color: #ff6b35 !important; }
/* card header line */
.card-header { display: flex; align-items: flex-start; gap: 8px; margin-bottom: 6px; }
.badge-f { background: #3d0e3d; color: #e080e0; font-size: 11px; font-weight: 700;
padding: 2px 7px; border-radius: 4px; white-space: nowrap; }
.badge-m { background: #0e1e3d; color: #80a8e0; font-size: 11px; font-weight: 700;
padding: 2px 7px; border-radius: 4px; white-space: nowrap; }
.card-name { font-size: 13px; font-weight: 600; color: #dde0f0; line-height: 1.35; }
/* tags row */
.card-tags { display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 4px; }
.card-tags span { font-size: 10px; padding: 2px 6px; border-radius: 3px; }
.t-lang { background: #1e3a1e; color: #88cc88; }
.t-acc { background: #1e2a3a; color: #88a8cc; }
.t-age { background: #2a1e2a; color: #aa88aa; }
/* description */
.card-desc { font-size: 11px; color: #5a5a80; line-height: 1.4; margin-bottom: 4px; }
/* "Use this voice" button override */
.use-btn { background: #ff6b35 !important; border: none !important;
font-weight: 700 !important; }
.use-btn:hover { background: #ff8755 !important; }
/* selected voice banner */
.sel-banner { background: #0d1a0d; border: 1px solid #2a4a2a; border-radius: 8px;
padding: 10px 14px; margin: 6px 0; }
/* pagination */
.pager-row { display: flex; align-items: center; gap: 12px; padding: 8px 0; }
"""
# ── UI ──────────────────────────────────────────────────────────────────────────
# The whole UI is declared inside this Blocks context: filter widgets, a fixed
# grid of PER_PAGE card slots, pagination, the selected-voice banner and the
# generation form, followed by the event wiring between them.
with gr.Blocks(title="Higgs Audio v3 TTS", analytics_enabled=False) as app:
    gr.Markdown(f"# Higgs Audio v3 TTS\nBrowse **{len(VOICES):,} voices**. Hit ▶ to preview, then **Use this voice** to generate.")

    # Filters
    with gr.Row():
        search_in = gr.Textbox(placeholder="Search by name or description…", label="Search", scale=3)
        lang_in = gr.Dropdown(LANGUAGES, value="All", label="Language", scale=2)
        gender_in = gr.Radio(GENDERS, value="All", label="Gender", scale=2)
        # Start with the same choices on_lang("All") would produce, so the
        # accent filter is usable before the language is ever changed.
        accent_in = gr.Dropdown(_accents_for("All"), value="All", label="Accent", scale=2)
    result_md = gr.Markdown("")

    # ── Fixed card grid (PER_PAGE slots) ───────────────────────────────────────
    # Build PER_PAGE card slots; each slot has one HTML component (card text
    # plus an embedded <audio> player) and a "Use this voice" button.
    # Slots are hidden when a page has fewer voices than PER_PAGE.
    card_rows = []  # gr.Column slots (show/hide)
    card_html = []  # gr.HTML — full card content incl. <audio> tag
    card_btns = []  # gr.Button — "Use this voice"
    page_voices = gr.State([])  # voice dicts on the current page
    COLS = 4
    for r_idx in range((PER_PAGE + COLS - 1) // COLS):
        with gr.Row():
            for c_idx in range(COLS):
                slot = r_idx * COLS + c_idx
                if slot >= PER_PAGE:
                    break
                with gr.Column(elem_classes=["voice-card"]) as col:
                    html = gr.HTML("")
                    btn = gr.Button("✅ Use this voice", size="sm",
                                    elem_classes=["use-btn"])
                card_html.append(html)
                card_btns.append(btn)
                card_rows.append(col)

    # Pagination
    with gr.Row(elem_classes=["pager-row"]):
        prev_btn = gr.Button("← Prev", size="sm", interactive=False)
        page_info = gr.Markdown("", elem_classes=["pager-info"])
        next_btn = gr.Button("Next →", size="sm", interactive=False)

    # Selected voice banner
    with gr.Row(visible=False, elem_classes=["sel-banner"]) as sel_row:
        with gr.Column(scale=2):
            sel_md = gr.Markdown("**No voice selected**")
        with gr.Column(scale=3):
            sel_audio = gr.Audio(label="Selected voice preview", type="filepath",
                                 interactive=False)
    sel_url = gr.State(None)  # preview URL of the selected voice, or None

    # Generation
    gr.Markdown("---\n## Write text to synthesize")
    with gr.Row():
        with gr.Column(scale=3):
            prompt_box = gr.Textbox(
                label="Text", lines=5,
                placeholder="Type what you want the selected voice to say.",
            )
            gen_btn = gr.Button("Generate", variant="primary", size="lg")
        with gr.Column(scale=2):
            with gr.Accordion("Settings", open=False):
                ref_text_in = gr.Textbox(
                    label="Reference transcript (auto-filled on selection, improves cloning)",
                    lines=2, placeholder="Auto-transcribed from the selected voice preview.",
                )
                temperature_s = gr.Slider(0., 1.5, .7, step=.05, label="Temperature")
                top_p_s = gr.Slider(.1, 1., .95, step=.01, label="Top-p")
                top_k_s = gr.Slider(0, 1026, 50, step=1, label="Top-k (0 = off)")
                max_tok_s = gr.Slider(64, 4096, 2048, step=64, label="Max new tokens")
                seed_n = gr.Number(-1, precision=0, label="Seed (-1 = random)")
            audio_out = gr.Audio(label="Generated audio", type="filepath")

    # ── Page state ─────────────────────────────────────────────────────────────
    page_state = gr.State(1)

    # ── Helpers: build all card + pagination outputs from a voice list + page ──
    def _card_markup(v):
        """Render one voice dict as the inner HTML of a gallery card.

        Every catalogue value is HTML-escaped before interpolation so that
        characters such as ``&``, ``<`` or ``"`` in a name, description or URL
        cannot break the markup.
        """
        from html import escape  # local import; the file does not import html

        if v.get("gender", "") == "female":
            badge_cls, symbol = "f", "♀"
        else:
            badge_cls, symbol = "m", "♂"
        name = escape(str(v.get("name") or "Unknown"))
        lang = escape(str(v.get("language") or "?"))
        accent = escape(str(v.get("accent") or "?"))
        age = escape(str(v.get("age") or "?"))
        # Truncate first, then escape, so an entity is never cut in half.
        desc = escape((v.get("description") or "")[:100])
        src = escape(v.get("preview_url") or "", quote=True)

        parts = [
            f'<div class="card-header"><span class="badge-{badge_cls}">{symbol}</span>'
            f'<span class="card-name">{name}</span></div>',
            f'<div class="card-tags">'
            f'<span class="t-lang">{lang}</span>'
            f'<span class="t-acc">{accent}</span>'
            f'<span class="t-age">{age}</span></div>',
        ]
        if desc:
            parts.append(f'<p class="card-desc">{desc}</p>')
        parts.append(
            f'<audio controls preload="none" src="{src}" '
            f'style="width:100%;height:32px;margin-top:4px"></audio>'
        )
        return "".join(parts)

    def _all_updates(filtered, page):
        """Return one value per component in ``_gallery_outputs``.

        *page* is clamped to the valid range for *filtered*. The result holds,
        in order: PER_PAGE HTML updates, PER_PAGE column-visibility updates,
        the result count, the page label, the Prev/Next interactivity, the
        voices on the page and the (clamped) page number.
        """
        total = len(filtered)
        total_pages = max(1, (total + PER_PAGE - 1) // PER_PAGE)
        page = max(1, min(page, total_pages))
        chunk = filtered[(page - 1) * PER_PAGE : page * PER_PAGE]
        html_updates, vis_updates = [], []
        for i in range(PER_PAGE):
            if i < len(chunk):
                html_updates.append(gr.update(value=_card_markup(chunk[i])))
                vis_updates.append(gr.update(visible=True))
            else:
                # Unused slot on a short (or empty) page: blank it and hide it.
                html_updates.append(gr.update(value=""))
                vis_updates.append(gr.update(visible=False))
        return (
            html_updates + vis_updates +
            [gr.update(value=f"**{total:,}** voices found"),
             gr.update(value=f"Page **{page}** / {total_pages}"),
             gr.update(interactive=page > 1),
             gr.update(interactive=page < total_pages),
             chunk, page]
        )

    # Must stay in the same order as the list returned by _all_updates.
    _gallery_outputs = (
        card_html + card_rows +
        [result_md, page_info, prev_btn, next_btn, page_voices, page_state]
    )

    # ── Filter change → reset to page 1 ────────────────────────────────────────
    def on_filter(s, l, g, a):
        """Re-run the filters and show the first page of the result."""
        filtered = _filter(s, l, g, a)
        return _all_updates(filtered, 1)

    def on_lang(l):
        """Offer only the accents of the chosen language and reset to "All"."""
        return gr.Dropdown(choices=_accents_for(l), value="All")

    lang_in.change(on_lang, lang_in, accent_in)
    for inp in [search_in, lang_in, gender_in, accent_in]:
        inp.change(on_filter,
                   [search_in, lang_in, gender_in, accent_in],
                   _gallery_outputs)

    # ── Pagination ──────────────────────────────────────────────────────────────
    def on_prev(s, l, g, a, pg):
        """Show the page before *pg* (clamped by _all_updates)."""
        return _all_updates(_filter(s, l, g, a), int(pg) - 1)

    def on_next(s, l, g, a, pg):
        """Show the page after *pg* (clamped by _all_updates)."""
        return _all_updates(_filter(s, l, g, a), int(pg) + 1)

    prev_btn.click(on_prev, [search_in, lang_in, gender_in, accent_in, page_state], _gallery_outputs)
    next_btn.click(on_next, [search_in, lang_in, gender_in, accent_in, page_state], _gallery_outputs)

    # ── "Use this voice" buttons ────────────────────────────────────────────────
    def _make_use_handler(slot_idx):
        """Build the click handler for the button in card slot *slot_idx*.

        A factory is used so each handler captures its own slot index. The
        handler returns exactly one value per wired output:
        ``(sel_md, sel_audio, sel_row, sel_url)``.
        """
        def handler(voices):
            voices = voices or []
            if slot_idx >= len(voices):
                # No voice behind this slot: leave the banner text and audio
                # untouched, hide the banner and clear the selected URL.
                # Four values, matching the four outputs.
                return gr.update(), gr.update(), gr.update(visible=False), None
            v = voices[slot_idx]
            name = v.get("name", "Unknown")
            preview = v.get("preview_url", "")
            tmp = None
            if preview:
                # Best effort: a failed download still selects the voice, just
                # without a local preview clip.
                try:
                    r = requests.get(preview, timeout=15)
                    r.raise_for_status()
                    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                        f.write(r.content)
                    tmp = f.name
                except (requests.RequestException, OSError) as e:
                    logging.warning(f"Preview download failed: {e}")
            return (
                gr.update(value=f"**Selected:** {name}"),
                gr.update(value=tmp),
                gr.update(visible=True),
                preview,
            )
        return handler

    for i, btn in enumerate(card_btns):
        btn.click(
            _make_use_handler(i),
            inputs=[page_voices],
            outputs=[sel_md, sel_audio, sel_row, sel_url],
        )

    # Auto-transcribe the selected voice's preview clip on CPU (Whisper) so
    # "Reference transcript" is pre-filled for Higgs Audio v3 cloning — the
    # user can still edit or clear it before generating.
    sel_audio.change(asr_backend.transcribe, inputs=[sel_audio], outputs=[ref_text_in])

    # ── Generate ────────────────────────────────────────────────────────────────
    gen_btn.click(
        on_generate,
        [prompt_box, sel_url, ref_text_in, temperature_s, top_p_s, top_k_s,
         max_tok_s, seed_n],
        [audio_out],
    )

    # ── Initial load ────────────────────────────────────────────────────────────
    app.load(
        lambda: _all_updates(VOICES, 1),
        outputs=_gallery_outputs,
    )
if __name__ == "__main__":
    # Port and public-link behaviour come from the environment:
    # GRADIO_SERVER_PORT (default 7860) and GRADIO_SHARE (default "1" = share).
    port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": port,
        "share": os.environ.get("GRADIO_SHARE", "1") == "1",
        "css": CSS,
        "ssr_mode": False,
    }
    # Cap the request queue at 10 pending jobs, then start the server.
    app.queue(max_size=10).launch(**launch_options)