Spaces:

Navid-AI
/

Arabic-TTS-Arena

Running

File size: 52,354 Bytes

import gradio as gr
import modal
import base64
import random
import json
import uuid
from pathlib import Path
from typing import Optional
from examples import SAMPLE_SENTENCES

APP_NAME = "arabic-tts-arena"

LEADERBOARD_FILE = Path(__file__).parent / "leaderboard.json"

MAX_SYNTHESIS_RETRIES = 2  # per-model retry cap before giving up
MIN_BATTLES = 45 # minimum battles for a model to appear on the leaderboard (to avoid unjust rankings for new models with few votes)

_AVAILABLE_MODELS_CACHE: dict[str, dict[str, str]] | None = None


def _fetch_model_registry() -> dict[str, dict[str, str]]:
    """Fetch the model registry from the Modal backend.

    Returns dict like:
        {"chatterbox": {"class_name": "ChatterboxModel", "display_name": "Chatterbox"}, ...}
    """
    service = modal.Cls.from_name(APP_NAME, "ArenaService")
    registry = service().get_model_registry.remote()
    if registry:
        print(f"✅ Fetched {len(registry)} models from Modal backend")
        return registry
    raise RuntimeError("Failed to fetch model registry from Modal backend")


def _get_available_models() -> dict[str, dict[str, str]]:
    """Lazy-load the model registry once on first use, then cache for the lifetime of the process."""
    global _AVAILABLE_MODELS_CACHE
    if _AVAILABLE_MODELS_CACHE is None:
        print("⏳ Fetching model registry from Modal backend...")
        _AVAILABLE_MODELS_CACHE = _fetch_model_registry()
        print(f"✅ Available models: {', '.join(_AVAILABLE_MODELS_CACHE.keys())}")
    return _AVAILABLE_MODELS_CACHE


def get_model_cls(model_id: str):
    """Get a Modal class by model_id using the registered class name."""
    available = _get_available_models()
    if model_id not in available:
        raise ValueError(f"Model not available: {model_id}")
    class_name = available[model_id]["class_name"]
    return modal.Cls.from_name(APP_NAME, class_name)


def get_display_name(model_id: str) -> str:
    """Get the human-readable display name for a model."""
    available = _get_available_models()
    if model_id in available:
        return available[model_id].get("display_name", model_id)
    return model_id


def get_arena_service():
    """Get ArenaService class for voting operations."""
    return modal.Cls.from_name(APP_NAME, "ArenaService")


HEADER_MD = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
    <h1 style="font-size: 2.2em; margin-bottom: 0.2em;"> Arabic TTS Arena</h1>
    <p style="font-size: 1.1em; color: #666; margin-top: 0;">
        Compare Arabic text‑to‑speech models side by side.<br>
        Listen, vote, and help build the community leaderboard.
    </p>
    <p style="font-size: 0.85em; margin-top: 0.3em;">
        <a href="https://huggingface.co/blog/Navid-AI/introducing-arabic-tts-arena" target="_blank" style="color: #10b981; text-decoration: none;">Blog post</a>
        &nbsp;·&nbsp;
        <a href="https://github.com/Navid-Gen-AI/arabic-tts-arena" target="_blank" style="color: #10b981; text-decoration: none;">GitHub</a>
    </p>
</div>
"""

HOW_IT_WORKS_MD = """
<div style="text-align: center; color: #888; font-size: 0.9em; margin-bottom: 0.5em;">
    <strong>How it works:</strong>
    Enter Arabic text → Listen to two anonymous models → Vote for the better one
</div>
"""

# Leaderboard header removed — metadata is now rendered inline by refresh_leaderboard()

ABOUT_MD = """
<style>
.about-wrap {
    max-width: 680px; margin: 0 auto; padding: 0.5em 0 2em 0;
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    line-height: 1.7; color: var(--body-text-color);
}
.about-wrap h2 {
    font-size: 1.6em; font-weight: 700; margin: 0 0 0.3em 0;
    letter-spacing: -0.01em;
}
.about-wrap h3 {
    font-size: 1.15em; font-weight: 700; margin: 1.6em 0 0.5em 0;
    letter-spacing: -0.01em;
}
.about-wrap p, .about-wrap li {
    font-size: 0.95em; color: #ccc;
}
.about-wrap ol { padding-left: 1.4em; }
.about-wrap ol li { margin-bottom: 0.35em; }
.about-wrap a {
    color: #10b981; text-decoration: none;
}
.about-wrap a:hover { text-decoration: underline; }
.about-wrap strong { color: var(--body-text-color); }

/* News / Updates section */
.news-section {
    border: 1px solid var(--border-color-primary);
    border-radius: 10px;
    padding: 1em 1.3em;
    margin-bottom: 1.8em;
    background: rgba(16,185,129,0.04);
}
.news-section h3 {
    margin: 0 0 0.6em 0 !important; font-size: 1.05em;
}
.news-item {
    display: flex; gap: 0.8em; align-items: baseline;
    margin-bottom: 0.4em; font-size: 0.9em;
}
.news-date {
    flex-shrink: 0; font-size: 0.82em; font-weight: 600;
    color: #10b981; white-space: nowrap;
    font-variant-numeric: tabular-nums;
}
.news-text { color: #ccc; }

/* Shortcuts / Voting tables */
.about-table {
    width: 100%; border-collapse: collapse; margin: 0.5em 0 0.8em 0;
    font-size: 0.9em;
}
.about-table th {
    text-align: left; padding: 0.5em 0.8em;
    border-bottom: 1px solid var(--border-color-primary);
    font-weight: 600; font-size: 0.85em;
    text-transform: uppercase; letter-spacing: 0.04em;
    color: var(--body-text-color-subdued, #888);
}
.about-table td {
    padding: 0.5em 0.8em;
    border-bottom: 1px solid rgba(255,255,255,0.05);
    color: #ccc;
}
.about-table td:first-child { font-weight: 600; color: var(--body-text-color); }
.about-table kbd {
    display: inline-block; padding: 0.15em 0.5em;
    border-radius: 4px; font-size: 0.9em; font-family: monospace;
    background: rgba(255,255,255,0.08);
    border: 1px solid rgba(255,255,255,0.12);
    color: var(--body-text-color);
}

/* Contribute card */
.contribute-card {
    border: 1px solid var(--border-color-primary);
    border-radius: 10px;
    padding: 1.2em 1.4em;
    margin-top: 0.5em;
    background: rgba(255,255,255,0.02);
}
.contribute-card p { margin-bottom: 0.6em; }
.contribute-card code {
    padding: 0.15em 0.45em; border-radius: 4px;
    font-size: 0.88em;
    background: rgba(255,255,255,0.08);
    color: #10b981;
}
.contribute-steps { display: flex; flex-direction: column; gap: 10px; }
.contribute-step {
    display: flex; align-items: center; gap: 12px;
    font-size: 0.93em; color: #ccc;
}
.step-num {
    flex-shrink: 0;
    width: 28px; height: 28px;
    display: flex; align-items: center; justify-content: center;
    border-radius: 50%;
    font-weight: 700; font-size: 0.85em;
    background: rgba(16,185,129,0.15);
    color: #10b981;
}
</style>

<div class="about-wrap">

<div class="news-section">
<h3>📢 Latest Updates</h3>
<div class="news-item"><span class="news-date">Apr 12, 2026</span><span class="news-text">🆕 Added 4 new models: <strong>OmniVoice</strong>, <strong>Lahgatna 2.0</strong>, <strong>SILMA TTS v2</strong>, <strong>VoxCPM 2</strong></span></div>
<div class="news-item"><span class="news-date">Apr 12, 2026</span><span class="news-text">🏷️ Retired 5 models: Lahgtna, SpeechT5 Arabic, OuteTTS 1.0, Arabic Spark TTS, SILMA TTS v1 Large — their ratings are preserved on the bottom accordion of the leaderboard</span></div>
<div class="news-item"><span class="news-date">Mar 17, 2026</span><span class="news-text">✨ Added Latency to Leaderboard based on feedback from <a href="https://www.linkedin.com/in/hazem-abdelazim-95153b72/" target="_blank">Dr. Hazem Abdelazim</a></span></div>
<div class="news-item"><span class="news-date">Mar 12, 2026</span><span class="news-text">🎉 Arena launched with 12 Arabic TTS models — <a href="https://huggingface.co/blog/Navid-AI/introducing-arabic-tts-arena" target="_blank">read the blog post</a></span></div>
<div class="news-item"><span class="news-date" style="opacity:0;">—</span><span class="news-text">🤝 Have a model that should be here? <a href="https://github.com/Navid-Gen-AI/arabic-tts-arena" target="_blank">Open a PR</a> — we'd love to welcome it in.</span></div>
</div>

<h2>Why We Built This</h2>

<p>Arabic is spoken by over 400 million people. It's the language of poetry, prayer, storytelling, and everyday life. Yet when it comes to text-to-speech, Arabic has been an afterthought — tested in labs, benchmarked on charts, but rarely <em>listened to</em> by the people it's meant to serve.</p>

<p>We wanted to change that. Not with another paper or another metric — but by putting the microphone in <strong>your</strong> hands. You listen. You choose. Your ear is the benchmark.</p>

<h3>How the Arena Works</h3>
<ol>
    <li>You type (or pick) an Arabic sentence</li>
    <li>Two anonymous models read it aloud</li>
    <li>You vote for the one that sounds more natural, more <em>human</em></li>
    <li>Rankings update — and the best voices rise to the top</li>
</ol>

<p>No model names are shown until after you vote, so every judgement is pure. Over time, thousands of these small choices build a leaderboard that reflects what people actually prefer — not what a loss function thinks is best.</p>

<h3>Your Moves</h3>
<table class="about-table">
    <thead><tr><th>Choice</th><th>What it means</th></tr></thead>
    <tbody>
        <tr><td>A is Better</td><td>Voice A sounded more natural to you</td></tr>
        <tr><td>B is Better</td><td>Voice B sounded more natural to you</td></tr>
        <tr><td>Both Good</td><td>Honestly, both sounded great</td></tr>
        <tr><td>Both Bad</td><td>Neither felt right</td></tr>
    </tbody>
</table>

<h3>Quick Keys</h3>
<table class="about-table">
    <thead><tr><th>Key</th><th>Action</th></tr></thead>
    <tbody>
        <tr><td><kbd>A</kbd></td><td>Vote for A</td></tr>
        <tr><td><kbd>B</kbd></td><td>Vote for B</td></tr>
        <tr><td><kbd>N</kbd></td><td>Next round</td></tr>
    </tbody>
</table>

<p style="text-align: center; color: #888; font-size: 0.88em; margin-top: 2em;">
    Built with ❤️ for the Arabic-speaking world by <a href="https://github.com/Navid-Gen-AI" target="_blank">Navid</a>
</p>

</div>
"""


def decode_audio_to_file(audio_base64: str) -> Optional[str]:
    """Decode base64 WAV audio and write to a temp file.

    Returns the file path (Gradio gr.Audio accepts file paths).
    """
    import tempfile

    try:
        wav_bytes = base64.b64decode(audio_base64)
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.write(wav_bytes)
        tmp.flush()
        tmp.close()
        return tmp.name
    except Exception:
        return None


def synthesize_audio(text: str, model_id: str) -> dict:
    """Call the ArenaService to synthesize (or return cached) audio.

    The backend checks its audio cache first and only calls the GPU model
    on a cache miss, saving compute and reducing latency.

    Retries up to MAX_SYNTHESIS_RETRIES times on failure before giving up.
    """
    last_error = None
    for attempt in range(1, MAX_SYNTHESIS_RETRIES + 1):
        try:
            service = get_arena_service()
            result = service().synthesize_or_cache.remote(text, model_id)
            if result.get("success"):
                return result
            last_error = result.get("error", "Unknown synthesis error")
            print(f"⚠️ {model_id} attempt {attempt} failed: {last_error}")
        except Exception as e:
            last_error = str(e)
            print(f"⚠️ {model_id} attempt {attempt} exception: {last_error}")
    # All retries exhausted
    return {
        "success": False,
        "error": f"{model_id} failed after {MAX_SYNTHESIS_RETRIES} attempts: {last_error}",
        "model_id": model_id,
    }


def _get_model_ratings() -> dict[str, dict]:
    """Read per-model elo and ci from the local leaderboard file."""
    try:
        if LEADERBOARD_FILE.exists():
            with open(LEADERBOARD_FILE, "r") as f:
                data = json.load(f)
            return {
                m["model_id"]: {
                    "elo": m.get("elo", 1000),
                    "ci": m.get("ci", 0),
                    "battles": m.get("battles", 0),
                }
                for m in data.get("models", [])
            }
    except Exception:
        pass
    return {}


def get_random_model_pair() -> tuple[str, str]:
    """Select two models using adaptive pairing for maximum information gain.

    Combines two signals to score every possible pair:

    1. **CI overlap** — pairs whose confidence intervals overlap are the
       most uncertain (we don't know which is better), so a vote between
       them is maximally informative.  Measured as the fraction of overlap
       relative to the smaller CI.  Pairs with no CI data yet get the
       maximum overlap score (1.0) so new models are explored.

    2. **Under-sampling** — pairs where either model has few battles get
       a boost via inverse-sqrt weighting, same as before.

    The two signals are multiplied together and used as sampling weights
    over all possible pairs, so the selection is stochastic (not greedy)
    and every pair retains a non-zero chance of appearing.
    """
    import math
    from itertools import combinations

    models = [m for m, info in _get_available_models().items() if not info.get("retired", False)]
    if len(models) < 2:
        raise ValueError("Not enough models available for comparison")

    ratings = _get_model_ratings()

    # --- score every candidate pair ---
    pairs: list[tuple[str, str]] = list(combinations(models, 2))
    pair_weights: list[float] = []

    for a, b in pairs:
        ra = ratings.get(a, {})
        rb = ratings.get(b, {})

        elo_a = ra.get("elo", 1000)
        elo_b = rb.get("elo", 1000)
        ci_a = ra.get("ci", 0)
        ci_b = rb.get("ci", 0)
        battles_a = ra.get("battles", 0)
        battles_b = rb.get("battles", 0)

        # -- Signal 1: CI overlap score (0–1) --
        # If either model has no CI yet, treat as maximally uncertain → 1.0
        if ci_a <= 0 or ci_b <= 0:
            overlap_score = 1.0
        else:
            # Interval: [elo - ci, elo + ci]
            lo_a, hi_a = elo_a - ci_a, elo_a + ci_a
            lo_b, hi_b = elo_b - ci_b, elo_b + ci_b
            overlap = max(0.0, min(hi_a, hi_b) - max(lo_a, lo_b))
            span = min(ci_a, ci_b) * 2  # width of the smaller CI
            overlap_score = min(overlap / span, 1.0) if span > 0 else 1.0

        # Ensure a minimum floor so distant pairs still occasionally appear
        overlap_score = max(overlap_score, 0.05)

        # -- Signal 2: under-sampling boost --
        # Use inverse-sqrt so models with fewer battles get a stronger boost
        exploration = (
            1.0 / math.sqrt(battles_a + 1)
            + 1.0 / math.sqrt(battles_b + 1)
        ) / 2.0

        # Combine signals: additive blend instead of purely multiplicative.
        # The overlap term (weighted 0.6) focuses votes where rankings are
        # uncertain; the exploration term (weighted 0.4) ensures under-
        # sampled models get paired regardless of how far apart their
        # ratings are.  This prevents a dominant model with few battles
        # from being starved of matchups.
        pair_weights.append(0.6 * overlap_score * exploration + 0.4 * exploration)

    # --- sample one pair stochastically ---
    (first, second), = random.choices(pairs, weights=pair_weights, k=1)

    # Randomise A/B assignment so there's no positional bias
    if random.random() < 0.5:
        return (first, second)
    return (second, first)


def get_random_sentence():
    """Return a random Arabic sample sentence."""
    return random.choice(SAMPLE_SENTENCES)


def _empty_comparison():
    """Return values that reset the UI to the pre-synthesis state."""
    return (
        None,
        None,  # audio_a, audio_b
        None,
        None,  # model_a_id, model_b_id
        None,
        None,  # audio_a_base64, audio_b_base64
        None,
        None,  # latency_a, latency_b
        gr.update(visible=False),  # audio_row
        gr.update(visible=False),  # vote_row
        gr.update(visible=False),  # result_display
        gr.update(value="🔊 Synthesize", interactive=True),  # synth_btn
        gr.update(interactive=True),  # text_input
        gr.update(value="", visible=False),  # status_display
        "🔒 Hidden",  # model_a_label
        "🔒 Hidden",  # model_b_label
        gr.update(visible=False),  # next_round_btn
        gr.update(interactive=True),  # vote_a_btn
        gr.update(interactive=True),  # vote_b_btn
        gr.update(interactive=True),  # vote_both_good_btn
        gr.update(interactive=True),  # vote_both_bad_btn
        "",  # current_text
    )


def _pick_replacement(exclude: set[str]) -> str | None:
    """Pick a model not in *exclude*, or None if none left."""
    candidates = [m for m, info in _get_available_models().items()
                   if m not in exclude and not info.get("retired", False)]
    return random.choice(candidates) if candidates else None


def _synth_one(text: str, model_id: str, used: set[str]) -> tuple[dict | None, str]:
    """Try to synthesize with *model_id*; on failure swap in a replacement once.

    Returns (result_dict_or_None, final_model_id).
    """
    result = synthesize_audio(text, model_id)
    if result.get("success"):
        return result, model_id

    # First model failed after retries — try a replacement
    replacement = _pick_replacement(used)
    if replacement:
        used.add(replacement)
        result = synthesize_audio(text, replacement)
        if result.get("success"):
            return result, replacement

    return None, model_id  # give up


def generate_comparison(text: str):
    """Generate audio from two random TTS models for comparison.

    Both models are synthesized in parallel using threads to halve wait time.
    Uses a generator to yield status updates so the user sees progress.
    """
    from concurrent.futures import ThreadPoolExecutor

    if not text or not text.strip():
        gr.Warning("Please enter some Arabic text first.")
        yield _empty_comparison()
        return

    text = text.strip()
    model_a_id, model_b_id = get_random_model_pair()

    # — Show "synthesizing" status —
    yield (
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,  # latency_a, latency_b
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(value="⏳ Synthesizing…", interactive=False),
        gr.update(interactive=False),
        gr.update(value="⏳ Generating audio from both models…", visible=True),
        "🔒 Hidden",
        "🔒 Hidden",
        gr.update(visible=False),
        gr.update(interactive=True),
        gr.update(interactive=True),
        gr.update(interactive=True),
        gr.update(interactive=True),
        text,
    )

    # — Synthesize both models in parallel —
    # Each thread gets its own used set for the replacement fallback logic.
    def synth_a():
        used = {model_a_id, model_b_id}
        return _synth_one(text, model_a_id, used)

    def synth_b():
        used = {model_a_id, model_b_id}
        return _synth_one(text, model_b_id, used)

    try:
        with ThreadPoolExecutor(max_workers=2) as pool:
            future_a = pool.submit(synth_a)
            future_b = pool.submit(synth_b)
            result_a, model_a_id = future_a.result()
            result_b, model_b_id = future_b.result()
    except Exception as e:
        gr.Warning(f"Connection error — is the backend deployed? ({e})")
        yield _empty_comparison()
        return

    if result_a is None:
        gr.Warning("Model A synthesis failed after retries. Please try again.")
        yield _empty_comparison()
        return

    if result_b is None:
        gr.Warning("Model B synthesis failed after retries. Please try again.")
        yield _empty_comparison()
        return

    # — Decode audio to temp files for Gradio —
    audio_a_path = decode_audio_to_file(result_a["audio_base64"])
    audio_b_path = decode_audio_to_file(result_b["audio_base64"])
    if not audio_a_path or not audio_b_path:
        gr.Warning("Failed to decode audio from backend.")
        yield _empty_comparison()
        return

    # Extract latency (None for cache hits / legacy responses)
    latency_a = result_a.get("latency_seconds")
    latency_b = result_b.get("latency_seconds")

    yield (
        audio_a_path,
        audio_b_path,
        model_a_id,
        model_b_id,
        result_a["audio_base64"],
        result_b["audio_base64"],
        latency_a,
        latency_b,
        gr.update(visible=True),  # audio_row
        gr.update(visible=True),  # vote_row
        gr.update(visible=False),  # result_display
        gr.update(value="🔊 Synthesize", interactive=True),
        gr.update(interactive=True),
        gr.update(value="", visible=False),  # hide status
        "🔒 Hidden",  # model_a_label
        "🔒 Hidden",  # model_b_label
        gr.update(visible=False),  # next_round_btn
        gr.update(interactive=True),  # vote_a_btn
        gr.update(interactive=True),  # vote_b_btn
        gr.update(interactive=True),  # vote_both_good_btn
        gr.update(interactive=True),  # vote_both_bad_btn
        text,
    )


def submit_vote(
    vote: str,
    text_prompt: str,
    model_a_id: str,
    model_b_id: str,
    audio_a_b64: str,
    audio_b_b64: str,
    latency_a: float | None,
    latency_b: float | None,
):
    """Submit a vote for the comparison."""
    if not model_a_id or not model_b_id:
        gr.Warning("Please synthesize audio first.")
        return (
            gr.update(visible=True),  # vote_row stays
            gr.update(visible=False),  # result_display
            gr.update(visible=False),  # next_round_btn
            "🔒 Hidden",
            "🔒 Hidden",
            gr.update(),  # vote_a_btn unchanged
            gr.update(),  # vote_b_btn unchanged
            gr.update(),  # vote_both_good_btn unchanged
            gr.update(),  # vote_both_bad_btn unchanged
        )

    session_id = uuid.uuid4().hex

    try:
        service = get_arena_service()
        result = service().record_vote.remote(
            session_id=session_id,
            text=text_prompt,
            model_a=model_a_id,
            model_b=model_b_id,
            winner=vote,
            audio_a_base64=audio_a_b64,
            audio_b_base64=audio_b_b64,
            latency_a=latency_a,
            latency_b=latency_b,
        )
    except Exception as e:
        gr.Warning(f"Vote failed: {e}")
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            "🔒 Hidden",
            "🔒 Hidden",
            gr.update(),  # vote_a_btn unchanged
            gr.update(),  # vote_b_btn unchanged
            gr.update(),  # vote_both_good_btn unchanged
            gr.update(),  # vote_both_bad_btn unchanged
        )

    if not result.get("success"):
        gr.Warning(f"Error: {result.get('error', 'Unknown')}")
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            "🔒 Hidden",
            "🔒 Hidden",
            gr.update(),  # vote_a_btn unchanged
            gr.update(),  # vote_b_btn unchanged
            gr.update(),  # vote_both_good_btn unchanged
            gr.update(),  # vote_both_bad_btn unchanged
        )

    vote_emoji = {
        "model_a": "🅰️ Model A",
        "model_b": "🅱️ Model B",
        "both_good": "👍 Both Good",
        "both_bad": "👎 Both Bad",
    }

    name_a = get_display_name(model_a_id)
    name_b = get_display_name(model_b_id)

    result_md = f"""
<div style="text-align:center; padding: 1.2em 1em; border-radius: 12px;
            background: var(--block-background-fill); border: 1px solid var(--border-color-primary);">
    <div style="font-size: 1.6em; margin-bottom: 0.3em;">✅ Vote Recorded!</div>
    <div style="font-size: 1.05em; margin-bottom: 0.8em;">
        You chose: <strong>{vote_emoji.get(vote, vote)}</strong>
    </div>
    <div style="display: flex; justify-content: center; gap: 2em; font-size: 1em;">
        <div>🅰️ <strong>{name_a}</strong></div>
        <div style="color: #aaa;">vs</div>
        <div>🅱️ <strong>{name_b}</strong></div>
    </div>
    <div style="margin-top: 0.8em; color: #888; font-size: 0.85em;">
        Thanks for voting! The leaderboard updates daily.
    </div>
</div>
"""
    return (
        gr.update(visible=False),  # hide vote_row
        gr.update(value=result_md, visible=True),  # show result
        gr.update(visible=True),  # show next_round_btn
        f"**{name_a}**",  # reveal model A
        f"**{name_b}**",  # reveal model B
        gr.update(interactive=False),  # disable vote_a_btn
        gr.update(interactive=False),  # disable vote_b_btn
        gr.update(interactive=False),  # disable vote_both_good_btn
        gr.update(interactive=False),  # disable vote_both_bad_btn
    )


def refresh_leaderboard():
    """Read and display leaderboard from local JSON file."""
    try:
        if not LEADERBOARD_FILE.exists():
            return _empty_leaderboard_md()

        with open(LEADERBOARD_FILE, "r") as f:
            data = json.load(f)

        models = data.get("models", [])
        last_updated = data.get("last_updated", "")

        if not models:
            return _empty_leaderboard_md()

        # Hide active models with too few battles (retired models always shown)
        models = [m for m in models if m.get("retired", False) or m.get("battles", 0) >= MIN_BATTLES]

        if not models:
            return _empty_leaderboard_md()

        # Format timestamp
        try:
            from datetime import datetime

            dt = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
            updated_str = dt.strftime("%b %d, %Y")
        except Exception:
            updated_str = last_updated or "—"

        # --- Styles ---
        style_block = """
        <style>
        .lb-container { max-width: 660px; margin: 0 auto; }
        .lb-meta {
            display: flex; justify-content: center; align-items: center;
            gap: 0.6em;
            font-size: 0.85em;
            color: var(--body-text-color-subdued, #999);
            padding: 0 0 0.8em 0;
        }
        .lb-meta strong {
            color: var(--body-text-color);
            font-weight: 700;
        }
        .lb-meta-sep {
            color: var(--body-text-color-subdued, #666);
            opacity: 0.5;
        }
        .lb-list { display: flex; flex-direction: column; gap: 0; }

        /* Each model row */
        .lb-item {
            display: grid;
            grid-template-columns: 48px 1fr 120px 90px 72px;
            align-items: center;
            gap: 0 12px;
            padding: 14px 20px;
            border-bottom: 1px solid var(--border-color-primary);
        }
        .lb-item:first-child { border-top: 1px solid var(--border-color-primary); }
        .lb-item:hover { background: rgba(255,255,255,0.03); }

        /* Top-3 subtle left accent */
        .lb-item.gold   { background: rgba(255,195,0,0.04); }
        .lb-item.silver { background: rgba(180,180,195,0.04); }
        .lb-item.bronze { background: rgba(210,140,70,0.03); }
        .lb-item.gold:hover   { background: rgba(255,195,0,0.08); }
        .lb-item.silver:hover { background: rgba(180,180,195,0.08); }
        .lb-item.bronze:hover { background: rgba(210,140,70,0.07); }

        .lb-rank {
            font-size: 1.1em; font-weight: 600;
            text-align: center;
            color: var(--body-text-color);
        }
        .lb-rank.r-gold   { color: #E8C33A; }
        .lb-rank.r-silver { color: #C0C0C8; }
        .lb-rank.r-bronze { color: #D4944A; }

        .lb-name-cell {
            display: flex;
            flex-wrap: wrap;
            align-items: center;
            gap: 4px 6px;
            min-width: 0;
        }
        .lb-name {
            font-weight: 600; font-size: 1.05em;
            color: var(--body-text-color) !important;
            text-decoration: none !important;
            white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
            cursor: default;
        }
        a.lb-name { cursor: pointer; }
        a.lb-name::after {
            content: '';
            display: inline-block;
            width: 0.95em; height: 0.95em;
            margin-left: 5px;
            vertical-align: middle;
            opacity: 0;
            transition: opacity 0.15s;
            background: currentColor;
            -webkit-mask: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='currentColor' stroke-width='2.5' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpath d='M7 17L17 7'/%3E%3Cpath d='M7 7h10v10'/%3E%3C/svg%3E") no-repeat center/contain;
            mask: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='currentColor' stroke-width='2.5' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpath d='M7 17L17 7'/%3E%3Cpath d='M7 7h10v10'/%3E%3C/svg%3E") no-repeat center/contain;
        }
        a.lb-name:hover { color: #10b981 !important; text-decoration: none !important; }
        a.lb-name:hover::after { opacity: 1; }

        .lb-score {
            text-align: center;
            font-weight: 700; font-size: 1.05em;
            font-variant-numeric: tabular-nums;
            color: var(--body-text-color);
        }
        .lb-votes {
            text-align: center;
            font-size: 0.92em;
            font-variant-numeric: tabular-nums;
            color: var(--body-text-color);
        }
        .lb-latency {
            text-align: center;
            font-size: 0.92em;
            font-variant-numeric: tabular-nums;
            color: var(--body-text-color);
            position: relative;
            cursor: default;
            overflow: hidden;
        }
        .lb-latency[data-gpu] {
            cursor: pointer;
        }

        /* The latency value and GPU label live in spans inside the cell */
        .lb-latency .lb-lat-val {
            display: block;
            transition: transform 0.25s ease, opacity 0.25s ease;
        }
        .lb-latency .lb-lat-gpu {
            display: block;
            position: absolute;
            inset: 0;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 0.85em;
            font-weight: 600;
            color: #10b981;
            letter-spacing: 0.01em;
            transform: translateY(100%);
            opacity: 0;
            transition: transform 0.25s ease, opacity 0.25s ease;
        }
        .lb-latency[data-gpu]:hover .lb-lat-val {
            transform: translateY(-100%);
            opacity: 0;
        }
        .lb-latency[data-gpu]:hover .lb-lat-gpu {
            transform: translateY(0);
            opacity: 1;
        }
        .lb-ci {
            font-size: 0.78em;
            font-weight: 400;
            color: var(--body-text-color-subdued, #888);
            margin-left: 3px;
            font-variant-numeric: tabular-nums;
        }

        /* Model tags (pills) */
        .lb-tag {
            display: inline-block;
            font-size: 0.65em;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 0.04em;
            padding: 0.15em 0.5em;
            border-radius: 4px;
            line-height: 1.4;
            white-space: nowrap;
        }
        .lb-tag-retired {
            background: rgba(255,255,255,0.08);
            color: var(--body-text-color-subdued, #888);
        }
        .lb-tag-proprietary {
            background: rgba(234,179,8,0.12);
            color: #d4a017;
        }

        /* Retired model row styling */
        .lb-item.retired { opacity: 0.5; }
        .lb-item.retired:hover { opacity: 0.7; background: rgba(255,255,255,0.02); }

        /* Retired accordion */
        .lb-retired-accordion {
            border-top: 1px solid var(--border-color-primary);
        }
        .lb-retired-accordion summary {
            display: flex;
            align-items: center;
            gap: 8px;
            padding: 10px 20px;
            font-size: 0.8em;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 0.05em;
            color: var(--body-text-color-subdued, #888);
            cursor: pointer;
            user-select: none;
            list-style: none;
            background: rgba(255,255,255,0.015);
            transition: background 0.15s ease;
        }
        .lb-retired-accordion summary:hover {
            background: rgba(255,255,255,0.04);
        }
        .lb-retired-accordion summary::-webkit-details-marker { display: none; }
        .lb-retired-accordion summary::before {
            content: '▸';
            display: inline-block;
            transition: transform 0.2s ease;
            font-size: 0.9em;
        }
        .lb-retired-accordion[open] summary::before {
            transform: rotate(90deg);
        }
        .lb-retired-accordion .lb-retired-body {
            border-top: 1px solid var(--border-color-primary);
        }

        /* Column labels */
        .lb-colheader {
            display: grid;
            grid-template-columns: 48px 1fr 120px 90px 72px;
            align-items: center;
            gap: 0 12px;
            padding: 6px 20px 8px 20px;
            font-size: 0.75em;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 0.05em;
            color: var(--body-text-color-subdued, #999);
        }
        .lb-colheader span:nth-child(3),
        .lb-colheader span:nth-child(4),
        .lb-colheader span:nth-child(5) { text-align: center; }

        @media (max-width: 600px) {
            .lb-container { margin: 0 4px; }
            .lb-colheader {
                grid-template-columns: 36px 1fr 72px;
                padding: 6px 10px 8px 10px;
                font-size: 0.7em;
            }
            .lb-colheader span:nth-child(4),
            .lb-colheader span:nth-child(5) { display: none; }
            .lb-item {
                grid-template-columns: 36px 1fr 72px;
                gap: 0 6px;
                padding: 10px 10px;
            }
            .lb-rank { font-size: 0.95em; }
            .lb-name { font-size: 0.92em; }
            .lb-score { font-size: 0.92em; }
            .lb-ci { font-size: 0.7em; }
            .lb-votes { display: none; }
            .lb-latency { display: none; }
            .lb-retired-accordion summary { padding: 10px 10px; }
        }
        @media (max-width: 380px) {
            .lb-colheader {
                grid-template-columns: 30px 1fr 64px;
                padding: 5px 6px 7px 6px;
            }
            .lb-item {
                grid-template-columns: 30px 1fr 64px;
                gap: 0 4px;
                padding: 9px 6px;
            }
            .lb-rank { font-size: 0.88em; }
            .lb-name { font-size: 0.85em; }
            .lb-score { font-size: 0.85em; }
            .lb-ci { display: none; }
            .lb-retired-accordion summary { padding: 9px 6px; }
        }
        </style>
        """

        # Metadata line
        total_battles = sum(m.get("battles", 0) for m in models)
        meta_html = (
            f'<div class="lb-meta">'
            f'<span>⚔️ <strong>{total_battles:,}</strong> battles</span>'
            f'<span class="lb-meta-sep">·</span>'
            f'<span>Updated <strong>{updated_str}</strong></span>'
            f'</div>'
        )

        # Column labels
        col_header = (
            '<div class="lb-colheader">'
            "<span>Rank</span>"
            "<span>Model</span>"
            "<span>Score</span>"
            "<span>Latency</span>"
            "<span>Battles</span>"
            "</div>"
        )

        # Separate active and retired models
        active_models = [m for m in models if not m.get("retired", False)]
        retired_models = [m for m in models if m.get("retired", False)]

        # Build rows
        tier_row = {1: "gold", 2: "silver", 3: "bronze"}
        tier_rank = {1: "r-gold", 2: "r-silver", 3: "r-bronze"}

        def _build_row(entry, is_retired=False):
            rank = entry["rank"]
            name = entry["name"]
            model_url = entry.get("model_url", "")
            elo = entry["elo"]
            ci = entry.get("ci", 0)
            battles = entry.get("battles", 0)
            avg_latency = entry.get("avg_latency")
            gpu = entry.get("gpu", "")
            open_weight = entry.get("open_weight", True)

            if is_retired:
                row_cls = "retired"
                rank_cls = ""
                rank_display = "—"
            else:
                row_cls = tier_row.get(rank, "")
                rank_cls = tier_rank.get(rank, "")
                rank_display = str(rank)

            is_api = not gpu

            if model_url:
                name_el = (
                    f'<a class="lb-name" href="{model_url}" target="_blank">{name}</a>'
                )
            else:
                name_el = f'<span class="lb-name">{name}</span>'

            # Build tags
            tags = ""
            if is_retired:
                tags += '<span class="lb-tag lb-tag-retired">Retired</span>'
            if not open_weight:
                tags += '<span class="lb-tag lb-tag-proprietary">Proprietary</span>'

            name_cell = f'<div class="lb-name-cell">{name_el}{tags}</div>'

            votes_text = f"{battles:,}" if battles else "—"
            ci_html = f'<span class="lb-ci">&plusmn;{ci:.0f}</span>' if ci else ""
            latency_text = f"{avg_latency:.1f}s" if avg_latency is not None else "—"
            if gpu:
                gpu_attr = f' data-gpu="{gpu}"'
                gpu_label = f'<span class="lb-lat-gpu">⚡ {gpu}</span>'
            elif is_api:
                gpu_attr = ' data-gpu="API"'
                gpu_label = '<span class="lb-lat-gpu">☁️ API</span>'
            else:
                gpu_attr = ""
                gpu_label = ""

            return (
                f'<div class="lb-item {row_cls}">'
                f'<div class="lb-rank {rank_cls}">{rank_display}</div>'
                f"{name_cell}"
                f'<div class="lb-score">{elo:.0f}{ci_html}</div>'
                f'<div class="lb-latency"{gpu_attr}><span class="lb-lat-val">{latency_text}</span>{gpu_label}</div>'
                f'<div class="lb-votes">{votes_text}</div>'
                f"</div>"
            )

        items_html = ""
        for i, entry in enumerate(active_models, start=1):
            entry["rank"] = i
            items_html += _build_row(entry)

        if retired_models:
            retired_rows = ""
            for entry in retired_models:
                retired_rows += _build_row(entry, is_retired=True)
            items_html += (
                f'<details class="lb-retired-accordion">'
                f'<summary>🏷️ Retired models ({len(retired_models)})</summary>'
                f'<div class="lb-retired-body">{retired_rows}</div>'
                f'</details>'
            )

        return (
            f'<div class="lb-container">'
            f"{style_block}{meta_html}{col_header}"
            f'<div class="lb-list">{items_html}</div>'
            f"</div>"
        )

    except Exception as e:
        return f"❌ Error loading leaderboard: {e}"


def _empty_leaderboard_md() -> str:
    return (
        '<div style="text-align:center; padding:3em 1em; color:var(--body-text-color-subdued,#888);">'
        "<h3>No data yet!</h3>"
        "<p>Be the first to vote — head to the <strong>⚔️ Battle</strong> tab.</p>"
        "</div>"
    )


def clear_for_next_round():
    """Reset the UI for a new comparison."""
    return (
        None,
        None,  # audio_a, audio_b
        None,
        None,  # model_a_id, model_b_id
        None,
        None,  # audio_a_base64, audio_b_base64
        None,
        None,  # latency_a_state, latency_b_state
        gr.update(visible=False),  # audio_row
        gr.update(visible=False),  # vote_row
        gr.update(visible=False),  # result_display
        gr.update(visible=False),  # next_round_btn
        "🔒 Hidden",
        "🔒 Hidden",  # model labels
        get_random_sentence(),  # new random sentence
        gr.update(interactive=True),  # re-enable vote_a_btn
        gr.update(interactive=True),  # re-enable vote_b_btn
        gr.update(interactive=True),  # re-enable vote_both_good_btn
        gr.update(interactive=True),  # re-enable vote_both_bad_btn
        "",  # current_text
    )


CUSTOM_CSS = """
/* Hide Gradio footer */
footer { display: none !important; }

/* RTL text input */
.text-input textarea {
    font-size: 1.15em !important;
    direction: rtl;
    line-height: 1.6;
}

/* Center helpers */
.center-text { text-align: center !important; }

/* Model column labels */
.model-label {
    text-align: center;
    font-weight: 600;
    font-size: 1em;
    padding: 0.4em 0 0.1em 0;
    min-height: 28px;
    letter-spacing: 0.02em;
}

/* Column header badges (A / B) */
.column-header {
    text-align: center;
    font-size: 1.15em;
    font-weight: 700;
    padding: 0.3em 0;
    margin-bottom: 0.15em;
}

/* Audio players */
.audio-player { min-height: 70px; }

/* Voting buttons — consistent sizing */
.vote-btn {
    min-height: 48px !important;
    font-size: 1em !important;
    font-weight: 600 !important;
}
.tie-btn {
    min-height: 44px !important;
    font-weight: 500 !important;
}

/* Next round button */
.next-btn {
    min-height: 48px !important;
    font-size: 1.05em !important;
    font-weight: 600 !important;
    margin-top: 0.5em !important;
}

/* Synth button */
.synth-btn {
    min-height: 48px !important;
    font-size: 1.05em !important;
}

/* Status message during synthesis */
.status-msg {
    text-align: center;
    font-size: 1.05em;
    color: #888;
    padding: 0.6em 0;
    animation: pulse 1.5s ease-in-out infinite;
}
@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.5; }
}

/* Leaderboard tab spacing */
.leaderboard-wrap { max-width: 680px; margin: 0 auto; }

/* Center the tab buttons */
.tabs > .tab-nav,
div[role="tablist"],
.tab-nav {
    justify-content: center !important;
}
"""

SHORTCUT_JS = """
<script>
document.addEventListener('keypress', function(e) {
    // Don't fire when user is typing in an input
    const tag = e.target.tagName.toLowerCase();
    if (tag === 'input' || tag === 'textarea') return;

    switch (e.key.toLowerCase()) {
        case 'a': document.getElementById('vote-a-btn')?.click(); break;
        case 'b': document.getElementById('vote-b-btn')?.click(); break;
        case 'n': document.getElementById('next-round-btn')?.click(); break;
    }
}, false);
</script>
"""


def create_demo():
    """Create the Gradio interface."""

    with gr.Blocks(
        title="Arabic TTS Arena",
        theme=gr.themes.Soft(
            primary_hue="emerald",
            secondary_hue="slate",
            neutral_hue="slate",
        ),
        css=CUSTOM_CSS,
        head=SHORTCUT_JS,
    ) as demo:
        # Header
        gr.HTML(HEADER_MD)

        with gr.Tabs():
            # Voting Tab
            with gr.TabItem("⚔️ Battle", id="battle"):
                gr.HTML(HOW_IT_WORKS_MD)

                # Hidden state
                model_a_id = gr.State(value=None)
                model_b_id = gr.State(value=None)
                audio_a_base64 = gr.State(value=None)
                audio_b_base64 = gr.State(value=None)
                latency_a_state = gr.State(value=None)
                latency_b_state = gr.State(value=None)
                current_text = gr.State(value="")

                # — Text input —
                with gr.Group():
                    with gr.Row():
                        text_input = gr.Textbox(
                            container=False,
                            show_label=False,
                            placeholder="اكتب نصاً عربياً هنا...",
                            lines=1,
                            max_lines=3,
                            scale=20,
                            elem_classes=["text-input"],
                        )
                        random_btn = gr.Button(
                            "🎲",
                            scale=0,
                            min_width=50,
                            variant="secondary",
                        )
                    synth_btn = gr.Button(
                        "🔊 Synthesize",
                        variant="primary",
                        size="lg",
                        elem_classes=["synth-btn"],
                    )

                # — Status indicator (shown during synthesis) —
                status_display = gr.HTML(
                    value="", visible=False, elem_classes=["status-msg"]
                )

                # — Audio players (hidden until synthesis) —
                with gr.Row(visible=False, equal_height=True) as audio_row:
                    with gr.Column():
                        gr.Markdown(
                            "### 🅰️ Model A",
                            elem_classes=["column-header", "center-text"],
                        )
                        audio_a = gr.Audio(
                            show_label=False,
                            interactive=False,
                            elem_classes=["audio-player"],
                        )
                        model_a_label = gr.Markdown(
                            "🔒 Hidden",
                            elem_classes=["model-label", "center-text"],
                        )
                        vote_a_btn = gr.Button(
                            "👆 A is Better",
                            variant="primary",
                            elem_id="vote-a-btn",
                            elem_classes=["vote-btn"],
                        )

                    with gr.Column():
                        gr.Markdown(
                            "### 🅱️ Model B",
                            elem_classes=["column-header", "center-text"],
                        )
                        audio_b = gr.Audio(
                            show_label=False,
                            interactive=False,
                            elem_classes=["audio-player"],
                        )
                        model_b_label = gr.Markdown(
                            "🔒 Hidden",
                            elem_classes=["model-label", "center-text"],
                        )
                        vote_b_btn = gr.Button(
                            "👆 B is Better",
                            variant="primary",
                            elem_id="vote-b-btn",
                            elem_classes=["vote-btn"],
                        )

                # — Tie buttons —
                with gr.Row(visible=False) as vote_row:
                    vote_both_good_btn = gr.Button(
                        "👍 Both Good",
                        variant="secondary",
                        elem_classes=["tie-btn"],
                    )
                    vote_both_bad_btn = gr.Button(
                        "👎 Both Bad",
                        variant="secondary",
                        elem_classes=["tie-btn"],
                    )

                # — Result card + next round —
                result_display = gr.HTML(visible=False)
                next_round_btn = gr.Button(
                    "⚡ Next Round (N)",
                    visible=False,
                    variant="primary",
                    elem_id="next-round-btn",
                    elem_classes=["next-btn"],
                )

                random_btn.click(fn=get_random_sentence, outputs=[text_input])

                synth_btn.click(
                    fn=generate_comparison,
                    inputs=[text_input],
                    outputs=[
                        audio_a,
                        audio_b,
                        model_a_id,
                        model_b_id,
                        audio_a_base64,
                        audio_b_base64,
                        latency_a_state,
                        latency_b_state,
                        audio_row,
                        vote_row,
                        result_display,
                        synth_btn,
                        text_input,
                        status_display,
                        model_a_label,
                        model_b_label,
                        next_round_btn,
                        vote_a_btn,
                        vote_b_btn,
                        vote_both_good_btn,
                        vote_both_bad_btn,
                        current_text,
                    ],
                )

                # Vote handlers (all four buttons share the same signature)
                def make_vote_handler(vote_type: str):
                    def handler(text, m_a, m_b, a_b64, b_b64, lat_a, lat_b):
                        return submit_vote(vote_type, text, m_a, m_b, a_b64, b_b64, lat_a, lat_b)

                    return handler

                vote_outputs = [
                    vote_row,
                    result_display,
                    next_round_btn,
                    model_a_label,
                    model_b_label,
                    vote_a_btn,
                    vote_b_btn,
                    vote_both_good_btn,
                    vote_both_bad_btn,
                ]
                vote_inputs = [
                    current_text,
                    model_a_id,
                    model_b_id,
                    audio_a_base64,
                    audio_b_base64,
                    latency_a_state,
                    latency_b_state,
                ]

                for btn, vtype in [
                    (vote_a_btn, "model_a"),
                    (vote_b_btn, "model_b"),
                    (vote_both_good_btn, "both_good"),
                    (vote_both_bad_btn, "both_bad"),
                ]:
                    btn.click(
                        fn=make_vote_handler(vtype),
                        inputs=vote_inputs,
                        outputs=vote_outputs,
                    )

                next_round_btn.click(
                    fn=clear_for_next_round,
                    outputs=[
                        audio_a,
                        audio_b,
                        model_a_id,
                        model_b_id,
                        audio_a_base64,
                        audio_b_base64,
                        latency_a_state,
                        latency_b_state,
                        audio_row,
                        vote_row,
                        result_display,
                        next_round_btn,
                        model_a_label,
                        model_b_label,
                        text_input,
                        vote_a_btn,
                        vote_b_btn,
                        vote_both_good_btn,
                        vote_both_bad_btn,
                        current_text,
                    ],
                )

            # Leaderboard Tab
            with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                with gr.Column(elem_classes=["leaderboard-wrap"]):
                    leaderboard_display = gr.HTML(
                        "<p style='text-align:center; color:var(--body-text-color-subdued,#888);'>Loading…</p>"
                    )
                demo.load(fn=refresh_leaderboard, outputs=[leaderboard_display])

            # About Tab
            with gr.TabItem("📖 Story", id="story"):
                gr.HTML(ABOUT_MD)

    return demo


if __name__ == "__main__":
    demo = create_demo()
    demo.queue(default_concurrency_limit=4).launch(server_name="0.0.0.0", server_port=7860)