#!/usr/bin/env python3
"""Higgs Audio v3 TTS voice gallery for Hugging Face ZeroGPU."""

import json
import logging
import os
import sys
import tempfile

import gradio as gr
import requests
import soundfile as sf
import spaces

_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_DIR, "src"))

import higgs_backend  # noqa: E402
import asr_backend  # noqa: E402

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# ── Voice backends ─────────────────────────────────────────────────────────────

# ── Voices ─────────────────────────────────────────────────────────────────────
with open(os.path.join(_DIR, "voices.json"), encoding="utf-8") as _f:
    VOICES = json.load(_f)

LANGUAGES = ["All"] + sorted({v.get("language", "") for v in VOICES if v.get("language")})
GENDERS = ["All", "female", "male"]
PER_PAGE = 20

logging.info(f"Loaded {len(VOICES):,} voices")


def _filter(search, lang, gender, accent):
    """Return the entries of VOICES that match every active filter.

    Args:
        search: Free-text query; matched case-insensitively as a substring of
            a voice's ``name`` or ``description``. Empty/None disables it.
        lang: Exact ``language`` value to keep, or ``"All"`` for no filter.
        gender: Exact ``gender`` value to keep, or ``"All"`` for no filter.
        accent: Exact ``accent`` value to keep, or ``"All"`` for no filter.

    Returns:
        A new list of voice dicts, in the same order as VOICES.
    """
    s = (search or "").lower()
    return [
        v
        for v in VOICES
        if (lang == "All" or v.get("language") == lang)
        and (gender == "All" or v.get("gender") == gender)
        and (accent == "All" or v.get("accent") == accent)
        and (
            not s
            # `or ""` guards against an explicit null in the JSON, the same
            # way the description lookup already does.
            or s in (v.get("name") or "").lower()
            or s in (v.get("description") or "").lower()
        )
    ]


def _accents_for(lang):
    """Return the accent choices for *lang*: ``"All"`` plus the sorted distinct accents.

    When *lang* is ``"All"`` every voice is considered; otherwise only voices
    whose ``language`` equals *lang*. Voices with a missing/empty accent are
    ignored.
    """
    pool = VOICES if lang == "All" else [v for v in VOICES if v.get("language") == lang]
    return ["All"] + sorted({v.get("accent", "") for v in pool if v.get("accent")})


# ── Model ──────────────────────────────────────────────────────────────────────
higgs_backend.load()
asr_backend.load()


@spaces.GPU(duration=10, size="large")
def on_generate(prompt, preview_url, ref_text, temperature, top_p, top_k,
                max_new_tok, seed, progress=gr.Progress()):
    """Synthesize *prompt* with Higgs Audio and return the path of the WAV file.

    Args:
        prompt: Text to speak; surrounding whitespace is stripped.
        preview_url: Optional URL of a reference clip. When truthy it is
            downloaded to a temporary file and passed as ``voice_ref``.
        ref_text: Forwarded unchanged as ``reference_text`` to the backend.
        temperature, top_p: Sampling parameters, coerced to ``float``.
        top_k, max_new_tok, seed: Coerced to ``int`` before being forwarded
            (``max_new_tok`` is passed as ``max_new_tokens``).
        progress: Gradio progress tracker.

    Returns:
        Filesystem path of the generated ``.wav`` file (created under /tmp).

    Raises:
        gr.Error: If *prompt* is empty or whitespace-only.
        requests.RequestException: If downloading *preview_url* fails.
    """
    if not (prompt or "").strip():
        raise gr.Error("Prompt is empty.")

    ref_path = None
    try:
        if preview_url:
            resp = requests.get(preview_url, timeout=30)
            resp.raise_for_status()
            # NOTE(review): the ".mp3" suffix assumes previews are MP3 — confirm.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
                # Record the path before writing so the `finally` below also
                # cleans up after a failed/partial write.
                ref_path = tmp.name
                tmp.write(resp.content)

        progress(0.5, desc="Generating with Higgs Audio v3…")
        waveform, sr = higgs_backend.generate(
            prompt.strip(),
            voice_ref=ref_path,
            reference_text=ref_text,
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
            max_new_tokens=int(max_new_tok),
            seed=int(seed),
        )

        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
        # because another process can claim the name before we write to it.
        fd, out = tempfile.mkstemp(suffix=".wav", prefix="higgs_", dir="/tmp")
        os.close(fd)
        try:
            sf.write(out, waveform, sr)
        except Exception:
            # Do not leave an empty placeholder behind when writing fails.
            os.unlink(out)
            raise
        return out
    finally:
        if ref_path and os.path.exists(ref_path):
            os.unlink(ref_path)


# ── CSS ─────────────────────────────────────────────────────────────────────────
CSS = """
/* card grid */
.card-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; }
@media (max-width: 1200px) { .card-grid { grid-template-columns: repeat(3, 1fr); } }
@media (max-width: 800px) { .card-grid { grid-template-columns: repeat(2, 1fr); } }

/* individual card — scoped inside the Gradio column */
.voice-card {
    background: #16161e !important;
    border: 1px solid #2a2a3a !important;
    border-radius: 10px !important;
    padding: 14px !important;
    height: 100% !important;
}
.voice-card:hover { border-color: #ff6b35 !important; }

/* card header line */
.card-header { display: flex; align-items: flex-start; gap: 8px; margin-bottom: 6px; }
.badge-f {
    background: #3d0e3d; color: #e080e0;
    font-size: 11px; font-weight: 700;
    padding: 2px 7px; border-radius: 4px; white-space: nowrap;
}
.badge-m {
    background: #0e1e3d; color: #80a8e0;
    font-size: 11px; font-weight: 700;
    padding: 2px 7px; border-radius: 4px; white-space: nowrap;
}
.card-name { font-size: 13px; font-weight: 600; color: #dde0f0; line-height: 1.35; }

/* tags row */
.card-tags { display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 4px; }
.card-tags span { font-size: 10px; padding: 2px 6px; border-radius: 3px; }
.t-lang { background: #1e3a1e; color: #88cc88; }
.t-acc { background: #1e2a3a; color: #88a8cc; }
.t-age { background: #2a1e2a; color: #aa88aa; }

/* description */
.card-desc { font-size: 11px; color: #5a5a80; line-height: 1.4; margin-bottom: 4px; }

/* "Use this voice" button override */
.use-btn { background: #ff6b35 !important; border: none !important; font-weight: 700 !important; }
.use-btn:hover { background: #ff8755 !important; }

/* selected voice banner */
.sel-banner {
    background: #0d1a0d;
    border: 1px solid #2a4a2a;
    border-radius: 8px;
    padding: 10px 14px;
    margin: 6px 0;
}

/* pagination */
.pager-row { display: flex; align-items: center; gap: 12px; padding: 8px 0; }
"""

# ── UI ──────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Higgs Audio v3 TTS", analytics_enabled=False) as app:
    gr.Markdown(f"# Higgs Audio v3 TTS\nBrowse **{len(VOICES):,} voices**. Hit ▶ to preview, then **Use this voice** to generate.")

    # Filters
    with gr.Row():
        search_in = gr.Textbox(placeholder="Search by name or description…", label="Search", scale=3)
        lang_in = gr.Dropdown(LANGUAGES, value="All", label="Language", scale=2)
        gender_in = gr.Radio(GENDERS, value="All", label="Gender", scale=2)
        accent_in = gr.Dropdown(["All"], value="All", label="Accent", scale=2)

    result_md = gr.Markdown("")

    # ── Fixed card grid (PER_PAGE slots) ───────────────────────────────────────
    # Build PER_PAGE card slots; each slot has HTML header, Audio, Use button.
    # Slots are hidden when a page has fewer voices than PER_PAGE.
    card_rows = []  # gr.Column slots (show/hide)
    card_html = []  # gr.HTML — full card content incl.