Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """Higgs Audio v3 TTS voice gallery for Hugging Face ZeroGPU.""" | |
| import json | |
| import logging | |
| import os | |
| import sys | |
| import tempfile | |
| import gradio as gr | |
| import requests | |
| import soundfile as sf | |
| import spaces | |
| _DIR = os.path.dirname(os.path.abspath(__file__)) | |
| sys.path.insert(0, os.path.join(_DIR, "src")) | |
| import higgs_backend # noqa: E402 | |
| import asr_backend # noqa: E402 | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") | |
# ── Voice backends ─────────────────────────────────────────────────────────────
# ── Voices ─────────────────────────────────────────────────────────────────────
# The catalogue is a JSON file that lives next to this script.
with open(os.path.join(_DIR, "voices.json"), encoding="utf-8") as _f:
    VOICES = json.loads(_f.read())

# Gallery page size (number of card slots built in the UI).
PER_PAGE = 20

# Dropdown / radio choices; "All" disables the corresponding filter.
GENDERS = ["All", "female", "male"]
LANGUAGES = ["All", *sorted({v["language"] for v in VOICES if v.get("language")})]

logging.info("Loaded %s voices", format(len(VOICES), ","))
def _filter(search, lang, gender, accent):
    """Return the voices that match the search text and the three filters.

    Args:
        search: Free-text query, matched case-insensitively as a substring of
            a voice's ``name`` or ``description``. ``None``, empty or
            whitespace-only text matches every voice.
        lang: Exact ``language`` value to keep, or ``"All"`` to skip the check.
        gender: Exact ``gender`` value to keep, or ``"All"`` to skip the check.
        accent: Exact ``accent`` value to keep, or ``"All"`` to skip the check.

    Returns:
        A new list of voice dicts taken from ``VOICES``, in catalogue order.
    """
    needle = (search or "").strip().lower()

    def _matches_text(voice):
        if not needle:
            return True
        # ``or ""`` covers keys that are present but null, for both fields.
        name = (voice.get("name") or "").lower()
        description = (voice.get("description") or "").lower()
        return needle in name or needle in description

    return [
        v for v in VOICES
        if (lang == "All" or v.get("language") == lang)
        and (gender == "All" or v.get("gender") == gender)
        and (accent == "All" or v.get("accent") == accent)
        and _matches_text(v)
    ]
def _accents_for(lang):
    """Build the accent choices available for a language selection.

    Args:
        lang: A ``language`` value from the catalogue, or ``"All"`` to consider
            every voice.

    Returns:
        ``["All"]`` followed by the distinct non-empty ``accent`` values of the
        matching voices, sorted ascending.
    """
    accents = set()
    for voice in VOICES:
        if lang != "All" and voice.get("language") != lang:
            continue
        accent = voice.get("accent")
        if accent:
            accents.add(accent)
    return ["All", *sorted(accents)]
# ── Model ──────────────────────────────────────────────────────────────────────
# Call load() on each backend at import time: the TTS backend first, then ASR.
for _backend in (higgs_backend, asr_backend):
    _backend.load()
def on_generate(prompt, preview_url, ref_text, temperature, top_p, top_k,
                max_new_tok, seed,
                progress=gr.Progress()):
    """Synthesize ``prompt`` and return the path of the resulting WAV file.

    Args:
        prompt: Text to speak; must contain at least one non-whitespace char.
        preview_url: URL of the reference voice clip, or a falsy value to
            generate without a reference.
        ref_text: Transcript of the reference clip, forwarded to the backend.
        temperature: Sampling temperature (coerced to ``float``).
        top_p: Nucleus sampling threshold (coerced to ``float``).
        top_k: Top-k cutoff (coerced to ``int``).
        max_new_tok: Maximum number of new tokens (coerced to ``int``).
        seed: Seed (coerced to ``int``); ``None`` is treated as -1, which the
            UI labels as "random".
        progress: Gradio progress tracker.

    Returns:
        Filesystem path of a ``.wav`` file written under ``/tmp``.

    Raises:
        gr.Error: If the prompt is empty or the reference clip cannot be
            downloaded.
    """
    text = (prompt or "").strip()
    if not text:
        raise gr.Error("Prompt is empty.")

    # A cleared Number field arrives as None; int(None) would raise TypeError.
    seed_value = -1 if seed is None else int(seed)

    ref_path = None
    try:
        if preview_url:
            try:
                resp = requests.get(preview_url, timeout=30)
                resp.raise_for_status()
            except requests.RequestException as exc:
                raise gr.Error(
                    f"Could not download the reference voice: {exc}"
                ) from exc
            # delete=False: the backend reads the file by path after it is
            # closed; the finally-clause below removes it.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
                ref_path = tmp.name
                tmp.write(resp.content)

        progress(0.5, desc="Generating with Higgs Audio v3…")
        waveform, sr = higgs_backend.generate(
            text, voice_ref=ref_path, reference_text=ref_text,
            temperature=float(temperature), top_p=float(top_p),
            top_k=int(top_k), max_new_tokens=int(max_new_tok), seed=seed_value,
        )

        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name that another process could claim first.
        fd, out = tempfile.mkstemp(suffix=".wav", prefix="higgs_", dir="/tmp")
        os.close(fd)
        try:
            sf.write(out, waveform, sr)
        except Exception:
            # Do not leave an empty or partial output file behind.
            if os.path.exists(out):
                os.unlink(out)
            raise
        return out
    finally:
        if ref_path and os.path.exists(ref_path):
            os.unlink(ref_path)
# ── CSS ────────────────────────────────────────────────────────────────────────
# Custom stylesheet for the gallery; handed to launch() at the bottom of the file.
CSS = """
/* card grid */
.card-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; }
@media (max-width: 1200px) { .card-grid { grid-template-columns: repeat(3, 1fr); } }
@media (max-width: 800px) { .card-grid { grid-template-columns: repeat(2, 1fr); } }
/* individual card β scoped inside the Gradio column */
.voice-card { background: #16161e !important; border: 1px solid #2a2a3a !important;
border-radius: 10px !important; padding: 14px !important; height: 100% !important; }
.voice-card:hover { border-color: #ff6b35 !important; }
/* card header line */
.card-header { display: flex; align-items: flex-start; gap: 8px; margin-bottom: 6px; }
.badge-f { background: #3d0e3d; color: #e080e0; font-size: 11px; font-weight: 700;
padding: 2px 7px; border-radius: 4px; white-space: nowrap; }
.badge-m { background: #0e1e3d; color: #80a8e0; font-size: 11px; font-weight: 700;
padding: 2px 7px; border-radius: 4px; white-space: nowrap; }
.card-name { font-size: 13px; font-weight: 600; color: #dde0f0; line-height: 1.35; }
/* tags row */
.card-tags { display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 4px; }
.card-tags span { font-size: 10px; padding: 2px 6px; border-radius: 3px; }
.t-lang { background: #1e3a1e; color: #88cc88; }
.t-acc { background: #1e2a3a; color: #88a8cc; }
.t-age { background: #2a1e2a; color: #aa88aa; }
/* description */
.card-desc { font-size: 11px; color: #5a5a80; line-height: 1.4; margin-bottom: 4px; }
/* "Use this voice" button override */
.use-btn { background: #ff6b35 !important; border: none !important;
font-weight: 700 !important; }
.use-btn:hover { background: #ff8755 !important; }
/* selected voice banner */
.sel-banner { background: #0d1a0d; border: 1px solid #2a4a2a; border-radius: 8px;
padding: 10px 14px; margin: 6px 0; }
/* pagination */
.pager-row { display: flex; align-items: center; gap: 12px; padding: 8px 0; }
"""
# ── UI ─────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Higgs Audio v3 TTS", analytics_enabled=False) as app:
    gr.Markdown(
        f"# Higgs Audio v3 TTS\nBrowse **{len(VOICES):,} voices**. "
        "Hit ▶ to preview, then **Use this voice** to generate."
    )

    # Filters
    with gr.Row():
        search_in = gr.Textbox(placeholder="Search by name or description…",
                               label="Search", scale=3)
        lang_in = gr.Dropdown(LANGUAGES, value="All", label="Language", scale=2)
        gender_in = gr.Radio(GENDERS, value="All", label="Gender", scale=2)
        # Start with the same choices on_lang("All") would produce, so the
        # accent filter is usable before the language is ever changed.
        accent_in = gr.Dropdown(_accents_for("All"), value="All",
                                label="Accent", scale=2)
    result_md = gr.Markdown("")

    # ── Fixed card grid (PER_PAGE slots) ───────────────────────────────────────
    # Build PER_PAGE card slots; each slot is a column holding an HTML card
    # (which embeds its own <audio> player) and a "Use this voice" button.
    # Slots are hidden when a page has fewer voices than PER_PAGE.
    card_rows = []   # gr.Column slots (show/hide)
    card_html = []   # gr.HTML — full card content incl. <audio> tag
    card_btns = []   # gr.Button — "Use this voice"
    page_voices = gr.State([])   # voice dicts on the current page
    COLS = 4
    for r_idx in range((PER_PAGE + COLS - 1) // COLS):
        with gr.Row():
            for c_idx in range(COLS):
                slot = r_idx * COLS + c_idx
                if slot >= PER_PAGE:
                    break
                with gr.Column(elem_classes=["voice-card"]) as col:
                    html = gr.HTML("")
                    # NOTE(review): the leading glyph was mis-encoded in the
                    # source and cannot be recovered; "✓" is a guess — confirm.
                    btn = gr.Button("✓ Use this voice", size="sm",
                                    elem_classes=["use-btn"])
                card_html.append(html)
                card_btns.append(btn)
                card_rows.append(col)

    # Pagination
    with gr.Row(elem_classes=["pager-row"]):
        prev_btn = gr.Button("← Prev", size="sm", interactive=False)
        page_info = gr.Markdown("", elem_classes=["pager-info"])
        next_btn = gr.Button("Next →", size="sm", interactive=False)

    # Selected voice banner
    with gr.Row(visible=False, elem_classes=["sel-banner"]) as sel_row:
        with gr.Column(scale=2):
            sel_md = gr.Markdown("**No voice selected**")
        with gr.Column(scale=3):
            sel_audio = gr.Audio(label="Selected voice preview", type="filepath",
                                 interactive=False)
    sel_url = gr.State(None)

    # Generation
    gr.Markdown("---\n## Write text to synthesize")
    with gr.Row():
        with gr.Column(scale=3):
            prompt_box = gr.Textbox(
                label="Text", lines=5,
                placeholder="Type what you want the selected voice to say.",
            )
            gen_btn = gr.Button("Generate", variant="primary", size="lg")
        with gr.Column(scale=2):
            with gr.Accordion("Settings", open=False):
                ref_text_in = gr.Textbox(
                    label="Reference transcript (auto-filled on selection, improves cloning)",
                    lines=2, placeholder="Auto-transcribed from the selected voice preview.",
                )
                temperature_s = gr.Slider(0., 1.5, .7, step=.05, label="Temperature")
                top_p_s = gr.Slider(.1, 1., .95, step=.01, label="Top-p")
                top_k_s = gr.Slider(0, 1026, 50, step=1, label="Top-k (0 = off)")
                max_tok_s = gr.Slider(64, 4096, 2048, step=64, label="Max new tokens")
                seed_n = gr.Number(-1, precision=0, label="Seed (-1 = random)")
            audio_out = gr.Audio(label="Generated audio", type="filepath")

    # ── Page state ─────────────────────────────────────────────────────────────
    page_state = gr.State(1)

    # ── Helpers: build all card + pagination outputs from a voice list + page ──
    def _card_html(voice):
        """Render one voice dict as the inner HTML of a gallery card.

        Every catalogue value is HTML-escaped before interpolation so that
        characters such as ``<``, ``&`` or ``"`` in voices.json cannot break
        the markup or the ``src`` attribute.
        """
        from html import escape  # function-scope import keeps the block self-contained

        gender = voice.get("gender")
        if gender == "female":
            badge = '<span class="badge-f">♀</span>'
        elif gender == "male":
            badge = '<span class="badge-m">♂</span>'
        else:
            badge = ""  # unknown gender: show no badge rather than the male one

        name = escape(str(voice.get("name") or "Unknown"))
        # ``or "?"`` also covers keys that are present but null.
        lang, accent, age = (
            escape(str(voice.get(key) or "?"))
            for key in ("language", "accent", "age")
        )
        desc = escape(str(voice.get("description") or "")[:100])
        src = escape(str(voice.get("preview_url") or ""), quote=True)

        parts = [
            f'<div class="card-header">{badge}'
            f'<span class="card-name">{name}</span></div>',
            '<div class="card-tags">'
            f'<span class="t-lang">{lang}</span>'
            f'<span class="t-acc">{accent}</span>'
            f'<span class="t-age">{age}</span></div>',
        ]
        if desc:
            parts.append(f'<p class="card-desc">{desc}</p>')
        if src:
            # Only emit a player when there is something to play.
            parts.append(
                f'<audio controls preload="none" src="{src}" '
                'style="width:100%;height:32px;margin-top:4px"></audio>'
            )
        return "".join(parts)

    def _all_updates(filtered, page):
        """Return the update list for ``_gallery_outputs``.

        Args:
            filtered: Voice dicts that passed the current filters.
            page: Requested 1-based page; clamped to the valid range.

        Returns:
            A list ordered as: PER_PAGE HTML updates, PER_PAGE visibility
            updates, result summary, page label, prev-button state,
            next-button state, the voices on this page, the clamped page.
        """
        total = len(filtered)
        total_pages = max(1, (total + PER_PAGE - 1) // PER_PAGE)
        page = max(1, min(page, total_pages))
        chunk = filtered[(page - 1) * PER_PAGE: page * PER_PAGE]

        html_updates, vis_updates = [], []
        for idx in range(PER_PAGE):
            if idx < len(chunk):
                html_updates.append(gr.update(value=_card_html(chunk[idx])))
                vis_updates.append(gr.update(visible=True))
            else:
                html_updates.append(gr.update(value=""))
                vis_updates.append(gr.update(visible=False))

        return (
            html_updates + vis_updates +
            [gr.update(value=f"**{total:,}** voices found"),
             gr.update(value=f"Page **{page}** / {total_pages}"),
             gr.update(interactive=page > 1),
             gr.update(interactive=page < total_pages),
             chunk, page]
        )

    # Must stay in the same order as the list returned by _all_updates.
    _gallery_outputs = (
        card_html + card_rows +
        [result_md, page_info, prev_btn, next_btn, page_voices, page_state]
    )

    # ── Filter change → reset to page 1 ────────────────────────────────────────
    def on_filter(s, l, g, a):
        """Re-run the filters and show the first page of the result."""
        return _all_updates(_filter(s, l, g, a), 1)

    def on_lang(l):
        """Refresh the accent choices for the chosen language and reset to "All"."""
        return gr.Dropdown(choices=_accents_for(l), value="All")

    lang_in.change(on_lang, lang_in, accent_in)
    for inp in [search_in, lang_in, gender_in, accent_in]:
        inp.change(on_filter,
                   [search_in, lang_in, gender_in, accent_in],
                   _gallery_outputs)

    # ── Pagination ─────────────────────────────────────────────────────────────
    def on_prev(s, l, g, a, pg):
        """Show the page before ``pg`` (clamped by _all_updates)."""
        return _all_updates(_filter(s, l, g, a), int(pg) - 1)

    def on_next(s, l, g, a, pg):
        """Show the page after ``pg`` (clamped by _all_updates)."""
        return _all_updates(_filter(s, l, g, a), int(pg) + 1)

    prev_btn.click(on_prev, [search_in, lang_in, gender_in, accent_in, page_state], _gallery_outputs)
    next_btn.click(on_next, [search_in, lang_in, gender_in, accent_in, page_state], _gallery_outputs)

    # ── "Use this voice" buttons ───────────────────────────────────────────────
    def _make_use_handler(slot_idx):
        """Create the click handler for the card in slot ``slot_idx``.

        The returned handler receives the voices on the current page and
        returns exactly four values, one per output wired below:
        ``sel_md``, ``sel_audio``, ``sel_row``, ``sel_url``.
        """
        def handler(voices):
            if not voices or slot_idx >= len(voices):
                # Stale slot: hide the banner and clear the stored URL.
                # (Four values — one per output; the original returned five.)
                return gr.update(), gr.update(), gr.update(visible=False), None

            v = voices[slot_idx]
            name = v.get("name", "Unknown")
            preview = v.get("preview_url", "")
            tmp = None
            if preview:
                # Best effort: a failed download still selects the voice, just
                # without a local preview clip.
                try:
                    r = requests.get(preview, timeout=15)
                    r.raise_for_status()
                    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                        f.write(r.content)
                    tmp = f.name
                except (requests.RequestException, OSError) as e:
                    logging.warning("Preview download failed: %s", e)
            return (
                gr.update(value=f"**Selected:** {name}"),
                gr.update(value=tmp),
                gr.update(visible=True),
                preview,
            )
        return handler

    for i, btn in enumerate(card_btns):
        btn.click(
            _make_use_handler(i),
            inputs=[page_voices],
            outputs=[sel_md, sel_audio, sel_row, sel_url],
        )

    # Auto-transcribe the selected voice's preview clip on CPU (Whisper) so
    # "Reference transcript" is pre-filled for Higgs Audio v3 cloning — the
    # user can still edit or clear it before generating.
    sel_audio.change(asr_backend.transcribe, inputs=[sel_audio], outputs=[ref_text_in])

    # ── Generate ───────────────────────────────────────────────────────────────
    gen_btn.click(
        on_generate,
        [prompt_box, sel_url, ref_text_in, temperature_s, top_p_s, top_k_s,
         max_tok_s, seed_n],
        [audio_out],
    )

    # ── Initial load ───────────────────────────────────────────────────────────
    app.load(
        lambda: _all_updates(VOICES, 1),
        outputs=_gallery_outputs,
    )
if __name__ == "__main__":
    # Listen on all interfaces. The port and the share-link switch come from
    # the environment (GRADIO_SERVER_PORT, GRADIO_SHARE), defaulting to 7860
    # and "1" respectively.
    launch_kwargs = dict(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        share=os.environ.get("GRADIO_SHARE", "1") == "1",
        css=CSS,
        ssr_mode=False,
    )
    app.queue(max_size=10).launch(**launch_kwargs)