Spaces:

MLSpeech
/

FALCON

Running

File size: 29,139 Bytes

"""
FALCON web demo — interactive forced alignment in the browser.

Two pretrained checkpoints are used by FALCON (under pretrained_models/):
  - falcon_timit_english.pt       — TIMIT-trained, best for English phoneme alignment
  - falcon_joint_multilingual.pt  — joint TIMIT+Buckeye model; best for cross-lingual /
                                   multilingual zero-shot alignment (Dutch, German,
                                   Hebrew, ...) at both phoneme and word level.

The app picks one automatically from the `Language` radio (english → TIMIT,
multilingual → joint); a custom .pt upload overrides both.

For HuggingFace Spaces deployment, set Space Secrets:
  HF_MODEL_REPO   — e.g. "MLSpeech/FALCON-weights"
  HF_TOKEN        — only needed for private repos
The app will download `falcon_timit_english.pt` and `falcon_joint_multilingual.pt`
from that repo on first use.
"""
import os
import re
import shutil
import sys
import tempfile
import threading
import time

# panphon 0.21.0 (pinned) ships an invalid-syntax line in featuretable.py — a stray
# type annotation inside a function call — that breaks `import panphon` on a clean
# install. Fix it on disk before anything imports panphon. Idempotent: a no-op when
# panphon is already patched (e.g. local installs).
def _patch_panphon():
    import importlib.util
    spec = importlib.util.find_spec("panphon")
    locs = getattr(spec, "submodule_search_locations", None) if spec else None
    if not locs:
        return
    ft = os.path.join(list(locs)[0], "featuretable.py")
    bad = "word_features = self.word_fts(word: str, normalize: bool=True)"
    good = "word_features = self.word_fts(word, normalize=True)"
    try:
        with open(ft) as f:
            src = f.read()
        if bad in src:
            with open(ft, "w") as f:
                f.write(src.replace(bad, good))
    except Exception as exc:
        print(f"[FALCON] panphon patch skipped: {exc}")

_patch_panphon()

import gradio as gr
import textgrid
import torchaudio

import utils
from predict import main_predict

# On HF Spaces, point the "MFA-like" word G2P at the bundled dictionaries / G2P FST
# and at this interpreter (which has pynini), so it works without a separate MFA
# aligner conda env. No effect off Spaces — your local MFA install is used as-is.
if os.environ.get("SPACE_ID"):
    _SPACE_DIR = os.path.dirname(os.path.abspath(__file__))
    os.environ.setdefault("MFA_ROOT_DIR", os.path.join(_SPACE_DIR, "mfa_assets"))
    os.environ.setdefault("FDNFA_MFA_ENV_PY", sys.executable)

# ── Checkpoint configuration ──────────────────────────────────────────────────

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PRETRAINED_DIR = os.path.join(SCRIPT_DIR, "pretrained_models")

CKPT_FILES = {
    "english":      "falcon_timit_english.pt",
    "buckeye":      "falcon_buckeye_english.pt",
    "multilingual": "falcon_joint_multilingual.pt",
}
CKPT_LABELS = {
    "english":      "Read English (recommended) — TIMIT model",
    "buckeye":      "Spontaneous English (recommended) — Buckeye model",
    "multilingual": "Multilingual — joint TIMIT+Buckeye model",
}

_ckpt_cache = {}

def _resolve_ckpt(key: str):
    """Resolve checkpoint path: cache → local file → HF Hub. Returns None if all fail."""
    if key in _ckpt_cache and os.path.exists(_ckpt_cache[key]):
        return _ckpt_cache[key]

    filename = CKPT_FILES[key]
    local_path = os.path.join(PRETRAINED_DIR, filename)
    if os.path.exists(local_path):
        _ckpt_cache[key] = local_path
        return local_path

    repo = os.environ.get("HF_MODEL_REPO", "")
    if repo:
        try:
            from huggingface_hub import hf_hub_download
            path = hf_hub_download(
                repo_id=repo,
                filename=filename,
                token=os.environ.get("HF_TOKEN"),
            )
            _ckpt_cache[key] = path
            return path
        except Exception as exc:
            print(f"[FALCON] HF Hub fetch failed for {filename}: {exc}")

    return None

_inference_lock = threading.Lock()

# ── Heartbeat-based auto-shutdown (local runs only) ───────────────────────────
# The open browser tab pings _heartbeat() every few seconds. A watchdog thread
# exits the process when the pings stop (tab closed / browser crashed / unload
# event dropped). _last_ping stays None until the first browser connects, so the
# server never self-exits before anyone opens it.
_last_ping = [None]
_HEARTBEAT_TIMEOUT = 90  # secs of silence before the local server self-exits

def _heartbeat():
    _last_ping[0] = time.time()

def _start_shutdown_watchdog():
    def _watch():
        while True:
            time.sleep(5)
            last = _last_ping[0]
            if last is not None and (time.time() - last) > _HEARTBEAT_TIMEOUT:
                os._exit(0)
    threading.Thread(target=_watch, daemon=True).start()

# ── Internal language routing ────────────────────────────────────────────────

def _internal_language(lang: str, mode: str, ann_ext: str) -> str:
    """
    Map UI choices to the internal `language` flag understood by main_predict.

      'english' = no G2P; assumes labels are already TIMIT-39 phonemes.
      'dutch'   = G2P pipeline (panphon-based articulatory mapping). Used for:
                    • any non-English language
                    • word-level alignment (.wrd, words need phoneme decomposition)
                    • plain text input (could be words or arbitrary phonemes)

    NOTE: `ann_ext` here must be the *original* extension supplied by the user —
    not the post-rewrite extension after a .txt → dummy .phn synthesis.
    """
    if lang == "english" and mode == "phoneme" and ann_ext.lower() == "phn":
        return "english"
    return "dutch"

# ── Word-level G2P selection ──────────────────────────────────────────────────

# Optional input-language hint -> (espeak voice, MFA voice or None). MFA ships
# pronunciation models only for en/de/nl; everything else uses espeak, or "none"
# (romanized characters -> LH39) when even espeak has no voice.
G2P_LANG_CHOICES = [
    "English (default)", "German", "Dutch", "Hebrew", "French",
    "Spanish", "Italian", "Russian", "Portuguese", "Other / unknown",
]
_G2P_LANG_MAP = {
    "English (default)": ("en-us", "en-us"),
    "German":            ("de", "de"),
    "Dutch":             ("nl", "nl"),
    "Hebrew":            ("he", None),
    "French":            ("fr", None),
    "Spanish":           ("es", None),
    "Italian":           ("it", None),
    "Russian":           ("ru", None),
    "Portuguese":        ("pt", None),
    "Other / unknown":   ("en-us", None),
}

def _resolve_g2p(g2p_choice, lang_choice):
    """Map the (G2P option, input-language) UI choices to a concrete backend.

    Returns (backend, voice, note). backend in {"mfa", "espeak", "char"}. Honors
    an explicit espeak / MFA-like / none choice but auto-falls-back when the
    chosen backend has no model for the language; "Auto" picks the best available.
    """
    espeak_voice, mfa_voice = _G2P_LANG_MAP.get(lang_choice, ("en-us", "en-us"))
    try:
        import mfa_g2p
        mfa_ok = mfa_voice is not None and mfa_g2p.mfa_available(mfa_voice)
    except Exception:
        mfa_ok = False
    choice = (g2p_choice or "Auto").lower()

    if choice.startswith("none"):
        return "char", espeak_voice or "en-us", "none (romanization)"
    if choice.startswith("mfa"):
        if mfa_ok:
            return "mfa", mfa_voice, "MFA-like"
        if espeak_voice:
            return "espeak", espeak_voice, "espeak (no MFA model for this language)"
        return "char", "en-us", "none (no MFA/espeak model)"
    if choice.startswith("espeak"):
        if espeak_voice:
            return "espeak", espeak_voice, "espeak"
        if mfa_ok:
            return "mfa", mfa_voice, "MFA-like (no espeak voice for this language)"
        return "char", "en-us", "none"
    # Auto (recommended)
    if mfa_ok:
        return "mfa", mfa_voice, "MFA-like (auto)"
    if espeak_voice:
        return "espeak", espeak_voice, "espeak (auto)"
    return "char", "en-us", "none (auto)"

# ── Core handler ──────────────────────────────────────────────────────────────

OUTPUTS_NONE = (None, None, None, None, None)  # 5 None for the non-status outputs

def run_alignment(audio_file, annotation_file, ckpt_upload, mode, lang,
                  pretrained_choice, g2p_choice="Auto (recommended)",
                  lang_choice="English (default)",
                  progress=gr.Progress(track_tqdm=True)):
    if not audio_file or not annotation_file:
        return ("Please upload both an audio file and an annotation file.", *OUTPUTS_NONE)

    ckpt_to_use = ckpt_upload if ckpt_upload else _resolve_ckpt(pretrained_choice)
    if not ckpt_to_use or not os.path.exists(ckpt_to_use):
        return (
            f"No checkpoint found. Expected {CKPT_FILES[pretrained_choice]} "
            f"in {PRETRAINED_DIR}, or HF_MODEL_REPO set, or upload a .pt file.",
            *OUTPUTS_NONE,
        )

    progress(0.1, desc="Preparing workspace...")
    workspace = tempfile.mkdtemp(prefix="falcon_")
    base = "input"
    wav_path = os.path.join(workspace, f"{base}.wav")

    original_ext = os.path.basename(annotation_file).split(".")[-1].lower()
    ann_ext = original_ext
    ann_path = os.path.join(workspace, f"{base}.{ann_ext}")

    # Resample audio to 16 kHz mono
    try:
        audio, sr = torchaudio.load(audio_file)
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        torchaudio.save(wav_path, audio, 16000)
    except Exception as exc:
        return (f"Audio error: {exc}", *OUTPUTS_NONE)

    shutil.copy(annotation_file, ann_path)

    # Capture the original input tokens (whatever the user supplied per line):
    #   .phn → phoneme labels
    #   .wrd → word labels
    #   .txt → space-separated tokens (words or phonemes)
    if original_ext == "txt":
        with open(ann_path) as f:
            orig_tokens = re.sub(r"[^\w\s]", "", f.read().strip()).split()
        # TIMIT .txt files are "<start_sample> <end_sample> <sentence>" — drop the
        # leading sample indices so they aren't mistaken for words.
        if len(orig_tokens) >= 3 and orig_tokens[0].isdigit() and orig_tokens[1].isdigit():
            orig_tokens = orig_tokens[2:]
        if not orig_tokens:
            return ("Text annotation is empty after stripping punctuation.", *OUTPUTS_NONE)
        # Synthesize a uniform-segments dummy .phn so downstream code has timestamps.
        audio_len = audio.shape[1]
        interval = audio_len / len(orig_tokens)
        ann_ext = "phn"
        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
        with open(ann_path, "w") as f:
            for i, tok in enumerate(orig_tokens):
                f.write(f"{int(i * interval)} {int((i + 1) * interval)} {tok}\n")
    else:
        with open(ann_path) as f:
            orig_tokens = [ln.strip().split()[-1] for ln in f if ln.strip()]

    # Route by ORIGINAL extension (post-rewrite ann_ext is "phn" for txt inputs).
    language = _internal_language(lang, mode, original_ext)

    # Word-level G2P: convert orthographic words -> LH39 phonemes with the chosen
    # front-end (espeak, or the MFA english_us_arpa G2P used in the paper), then
    # align via the stock phoneme path. Replaces the legacy letter-by-letter
    # mapping. Only applies to real word input (.wrd / .txt / .word).
    # Word/text inputs (.wrd / .txt) always go through the proper word -> LH39 G2P,
    # regardless of the phoneme/word toggle — otherwise phoneme mode would fall back
    # to a crude character mapping and misalign. (.phn is the phoneme-input path.)
    app_mapped_ph = None
    word_g2p_note = ""
    if original_ext in ("wrd", "txt", "word") and orig_tokens:
        import word_g2p
        backend, voice, g2p_note = _resolve_g2p(g2p_choice, lang_choice)
        try:
            app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice, backend=backend)
                             for tok in orig_tokens]
        except Exception as g2p_exc:
            # A missing model must never break the whole run — fall back.
            print(f"[FALCON] G2P backend '{backend}' failed ({g2p_exc}); falling back.")
            try:
                app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice or "en-us",
                                                       backend="espeak")
                                 for tok in orig_tokens]
                g2p_note += " → espeak fallback"
            except Exception:
                app_mapped_ph = [word_g2p.word_to_lh39(tok, backend="char")
                                 for tok in orig_tokens]
                g2p_note += " → none fallback"
        word_g2p_note = f" Word G2P: {g2p_note}."
        phons = [ph for seq in app_mapped_ph for ph in seq] or ["sil"]
        # Rewrite the annotation as a dummy uniform-time .phn of LH39 phonemes so
        # the stock English/phoneme aligner runs on them (no further G2P).
        ann_ext = "phn"
        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
        audio_len = audio.shape[1]
        interval = audio_len / max(1, len(phons))
        with open(ann_path, "w") as f:
            for i, ph in enumerate(phons):
                f.write(f"{int(i * interval)} {int((i + 1) * interval)} {ph}\n")
        language = "english"   # phonemes are already LH39; skip the internal G2P

    progress(0.3, desc="Running alignment...")
    try:
        with _inference_lock:
            utils.set_dp_matrix_out_dir(workspace)
            pred_bound, _truth_bound, mapped_ph = main_predict(
                wav=wav_path,
                ckpt=ckpt_to_use,
                w_phi=0.5,  # placeholder; the model uses its own learned w_phi at inference
                language=language,
                annotation=ann_ext,
            )
            utils.set_dp_matrix_out_dir(None)
            progress(0.6, desc="Rendering aligned visualization...")
            # Time-aligned representations as one stacked figure (waveform,
            # spectrogram, phoneme posteriors, Soft-DP matrix + path, contrastive
            # score) — all on the same time axis with predicted boundaries overlaid.
            panels_path = os.path.join(workspace, "panels.png")
            try:
                import falcon_viz
                falcon_viz.make_alignment_panels(
                    wav=wav_path, ckpt=ckpt_to_use, out_path=panels_path,
                    language=language, annotation=ann_ext,
                    show_truth=(original_ext == "phn" and mode == "phoneme"),
                )
            except Exception as viz_exc:
                print(f"[FALCON] panel viz failed: {viz_exc}")
                panels_path = None
    except Exception as exc:
        utils.set_dp_matrix_out_dir(None)
        return (f"Inference error: {exc}", *OUTPUTS_NONE)

    # When the app did the word-level G2P itself, use its per-word LH39 phoneme
    # lists for the two-table / TextGrid word tier (main_predict's English path
    # returns mapped_ph=None).
    if app_mapped_ph is not None:
        mapped_ph = app_mapped_ph

    progress(0.8, desc="Building outputs...")

    pred_bound_list = [float(t) for t in pred_bound]

    # ── Build two tables ─────────────────────────────────────────────────────
    # 1) LH39 phonemes  — the aligner's direct output, one row per pred_bound.
    # 2) Original tokens — words (.wrd / .txt) or non-LH39 phonemes (multilingual
    #    .phn). Only populated when the G2P path was used (mapped_ph != None);
    #    for english+phoneme+.phn the LH39 phonemes ARE the original, so the
    #    second table is left empty.
    # Every input token gets a row in the table even if its predicted interval
    # is degenerate; degenerate intervals are still kept out of the TextGrid
    # (Praat rejects zero-length).

    if mapped_ph is not None:
        phn_labels = [ph for seq in mapped_ph for ph in seq]
    else:
        phn_labels = orig_tokens

    table_phonemes, phn_intervals = [], []
    t0 = 0.0
    for i, t1 in enumerate(pred_bound_list):
        lbl = phn_labels[i] if i < len(phn_labels) else ""
        table_phonemes.append([round(t0, 3), round(t1, 3), lbl])
        if t1 > t0:
            phn_intervals.append((t0, t1, lbl))
        t0 = t1

    table_original, orig_intervals = [], []
    if mapped_ph is not None:
        counts_per_token = [len(seq) for seq in mapped_ph]
        cumulative = 0
        t0 = 0.0
        for i, count in enumerate(counts_per_token):
            cumulative += count
            if cumulative - 1 >= len(pred_bound_list):
                break
            t1 = pred_bound_list[cumulative - 1]
            lbl = orig_tokens[i] if i < len(orig_tokens) else ""
            table_original.append([round(t0, 3), round(t1, 3), lbl])
            if t1 > t0:
                orig_intervals.append((t0, t1, lbl))
            t0 = t1

    # ── TextGrid: phones tier always, original tier when applicable ──────────
    max_time = phn_intervals[-1][1] if phn_intervals else 0.0
    tg = textgrid.TextGrid(minTime=0, maxTime=max_time)
    tier_phn = textgrid.IntervalTier(name="phones", minTime=0, maxTime=max_time)
    for t0_iv, t1_iv, lbl_iv in phn_intervals:
        tier_phn.add(minTime=t0_iv, maxTime=t1_iv, mark=lbl_iv)
    tg.append(tier_phn)
    if orig_intervals:
        # Tier name reflects what the original layer represents.
        if mode == "word":
            orig_tier_name = "words"
        elif original_ext == "phn":
            orig_tier_name = "phones_original"
        else:
            orig_tier_name = "tokens"
        tier_orig = textgrid.IntervalTier(name=orig_tier_name, minTime=0, maxTime=max_time)
        for t0_iv, t1_iv, lbl_iv in orig_intervals:
            tier_orig.add(minTime=t0_iv, maxTime=t1_iv, mark=lbl_iv)
        tg.append(tier_orig)
    tg_path = os.path.join(workspace, f"{base}.TextGrid")
    tg.write(tg_path)

    # Status note: the .phn-as-word case produces a second "words" table that
    # actually contains phonemes — surface this in the status so it's not
    # mistaken for a bug.
    status_note = word_g2p_note
    if mode == "word" and original_ext == "phn":
        status_note += (" Note: input was phoneme-level (.phn) but mode=word "
                        "— the 'original' table shows input phonemes since no "
                        "word annotations were provided.")

    return (
        "Done." + status_note,
        audio_file,
        panels_path if panels_path and os.path.exists(panels_path) else None,
        tg_path,
        table_phonemes,
        table_original,
    )

# ── UI ────────────────────────────────────────────────────────────────────────

PRETRAINED_RADIO_CHOICES = [
    (CKPT_LABELS["english"],      "english"),
    (CKPT_LABELS["buckeye"],      "buckeye"),
    (CKPT_LABELS["multilingual"], "multilingual"),
]

# Benchmark results, shown under the example. Numbers are exactly those reported
# in the paper (arXiv:2606.25460). "specialist" = trained on the target English
# corpus; "joint" = one model jointly trained on TIMIT+Buckeye; multilingual rows
# are zero-shot (no target-language training data).
RESULTS_MD = """### Results (from the [paper](https://arxiv.org/abs/2606.25460))

Accuracy = % of reference boundaries matched within the ms tolerance. **Specialist** =
trained on the target English corpus; **joint** = one model jointly trained on
TIMIT+Buckeye; multilingual rows are **zero-shot** (no target-language training data).

**Phone-level Alignment Accuracy [%]: MFA vs. FALCON (Ours)**

| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| TIMIT | MFA | **38.6** | 72.3 | 81.1 | 84.6 |
| TIMIT | FALCON specialist | 37.66 | **83.88** | **94.85** | **98.62** |
| TIMIT | FALCON joint | 34.70 | 82.62 | 94.91 | 98.60 |
| Buckeye | MFA | **35.3** | 60.6 | 68.9 | 72.7 |
| Buckeye | FALCON specialist | 29.69 | **69.93** | **90.07** | **97.40** |
| Buckeye | FALCON joint | 28.87 | 69.40 | 89.53 | 97.13 |

**Phoneme-Level: Unseen Multilingual Generalization Accuracy**

| Test set | Model | ≤10 | ≤15 | ≤20 | ≤25 | ≤50 | ≤100 |
|---|---|---|---|---|---|---|---|
| Dutch — IFA | **FALCON joint** | **26.85** | **36.16** | **44.56** | **51.17** | **69.94** | **84.11** |
| Dutch — IFA | FALCON specialist | 26.86 | 35.79 | 43.85 | 50.34 | 68.68 | 83.22 |
| Dutch — IFA | MFA | 11.01 | 14.70 | 19.05 | 21.80 | 33.90 | 51.02 |
| German — PHONDAT | **FALCON joint** | **25.63** | **34.12** | **41.87** | **49.07** | **70.04** | **84.58** |
| German — PHONDAT | FALCON specialist | 25.08 | 33.37 | 40.76 | 47.43 | 68.27 | 82.44 |
| German — PHONDAT | MFA | 20.60 | 31.75 | 37.17 | 45.83 | 66.78 | 79.19 |
| Hebrew | **FALCON joint** | **21.98** | **30.10** | **36.91** | **42.78** | **63.07** | **80.41** |
| Hebrew | FALCON specialist | 21.03 | 27.78 | 34.30 | 39.79 | 59.38 | 77.76 |

**Word-Level Alignment Accuracy [%]: Comparative Analysis**

| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| TIMIT | FALCON spec (MFA-G2P) | 49.22 | **81.79** | **93.04** | 98.37 |
| TIMIT | FALCON joint (MFA-G2P) | **49.50** | 80.60 | 92.86 | **98.46** |
| TIMIT | MFA | 41.60 | 72.80 | 89.40 | 97.40 |
| TIMIT | MMS | 18.60 | 43.50 | 75.70 | 94.70 |
| TIMIT | WhisperX | 22.40 | 52.70 | 82.40 | 94.20 |
| TIMIT | Nvidia-Canary-1b | 9.23 | 23.11 | 44.23 | 72.81 |
| Buckeye | FALCON spec (MFA-G2P) | 50.06 | 77.85 | **91.51** | **96.63** |
| Buckeye | FALCON joint (MFA-G2P) | **50.42** | **77.98** | 91.01 | 96.55 |
| Buckeye | MFA | 39.80 | 69.90 | 84.90 | 91.80 |
| Buckeye | MMS | 25.00 | 52.70 | 75.00 | 87.90 |
| Buckeye | WhisperX | 18.80 | 43.10 | 67.40 | 77.40 |
| Buckeye | Nvidia-Canary-1b | 8.06 | 18.83 | 36.31 | 63.29 |

**Word-Level: Unseen Multilingual Generalization Accuracy**

| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| German — PHONDAT | FALCON (MFA-G2P) | **44.20** | **68.48** | **86.12** | **95.11** |
| German — PHONDAT | MFA | 29.9 | 65.4 | 82.1 | 94.3 |
| German — PHONDAT | MMS | 21.8 | 44.3 | 74.9 | 91.8 |
| Dutch — IFA | FALCON (MFA-G2P) | **26.38** | **45.15** | 61.16 | 76.49 |
| Dutch — IFA | MFA | 4.7 | 7.3 | 11.6 | 19.0 |
| Dutch — IFA | MMS | 16.0 | 37.9 | **62.9** | **76.6** |
| Hebrew | FALCON | **31.91** | **56.72** | 75.18 | 87.89 |
| Hebrew | MMS | 14.3 | 41.3 | **76.5** | **94.7** |
"""

with gr.Blocks(title="FALCON Forced Aligner", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# FALCON: Forced Alignment through Contrastive Optimization Networks")
    gr.Markdown(
        "Upload a speech file and a transcript to predict precise phoneme or word boundaries "
        "using Soft Dynamic Programming."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Inputs")
            audio_in  = gr.Audio(label="Audio file (any sample rate)", type="filepath")
            ann_in    = gr.File(label="Annotation (.phn / .wrd / .txt)")
            mode_in   = gr.Radio(["phoneme", "word"], value="phoneme", label="Mode")
            lang_in   = gr.Radio(["english", "multilingual"], value="english", label="Language")
            # Word-level G2P front-end (word mode only). The espeak voice / MFA
            # dictionary is chosen automatically from the optional input-language
            # hint below; "Auto" also picks the best available backend.
            g2p_in    = gr.Radio(
                ["Auto (recommended)", "espeak", "MFA-like",
                 "none — romanization (not recommended; only for languages with no G2P model)"],
                value="Auto (recommended)",
                label="Word G2P (used only in word mode)",
            )
            lang_g2p_in = gr.Dropdown(
                G2P_LANG_CHOICES,
                value="English (default)",
                label="Input language (optional, recommended — improves G2P choice)",
            )
            pretrained_in = gr.Radio(
                choices=PRETRAINED_RADIO_CHOICES,
                value="english",
                label="Pretrained checkpoint (auto-follows Language; override here if you want)",
            )
            ckpt_in   = gr.File(label="Or upload a custom checkpoint (.pt) — overrides pretrained",
                                file_types=[".pt"], type="filepath")
            btn       = gr.Button("Run Alignment", variant="primary")
            status    = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=2):
            gr.Markdown("### Outputs")
            with gr.Tabs():
                with gr.Tab("Alignment Data"):
                    audio_out  = gr.Audio(label="Playback", interactive=False)
                    table_phn_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Phoneme (LH39)"],
                        label="LH39 phonemes — aligner output",
                    )
                    table_orig_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Label"],
                        label="Original input layer (words / non-LH39 phonemes) — empty for English phoneme alignment",
                    )
                    tg_out     = gr.File(label="Download TextGrid (carries both tiers when applicable)")
                with gr.Tab("Visualizations"):
                    img_panels = gr.Image(
                        label="Time-aligned representations — waveform · spectrogram · phoneme posteriors · Soft-DP path · contrastive score (shared time axis; predicted boundaries overlaid). Click to enlarge.",
                        show_download_button=True,
                    )

    gr.Markdown("### Example — click to load it, then press **Run Alignment**")
    gr.Examples(
        examples=[
            ["assets/fasw0sa2.wav", "assets/fasw0sa2.phn", "phoneme",
             "english", "english", "Auto (recommended)", "English (default)"],
        ],
        inputs=[audio_in, ann_in, mode_in, lang_in, pretrained_in, g2p_in, lang_g2p_in],
        label="English · TIMIT",
    )

    gr.Markdown(RESULTS_MD)

    # Auto-flip the pretrained-checkpoint radio when the user changes language.
    lang_in.change(fn=lambda v: v, inputs=lang_in, outputs=pretrained_in)

    btn.click(
        fn=run_alignment,
        inputs=[audio_in, ann_in, ckpt_in, mode_in, lang_in, pretrained_in,
                g2p_in, lang_g2p_in],
        outputs=[status, audio_out, img_panels,
                 tg_out, table_phn_out, table_orig_out],
    )

    # Auto-shutdown the local Python server when the user closes the browser
    # tab. Disabled on HuggingFace Spaces (where SPACE_ID is set automatically)
    # because the container is shared across visitors — one tab close should
    # not tear down everyone else's session.
    if not os.environ.get("SPACE_ID"):
        # Fast path: unload events click a hidden Shutdown button -> os._exit.
        shutdown_btn = gr.Button("Shutdown", visible=False, elem_id="falcon-shutdown-btn")
        shutdown_btn.click(fn=lambda: os._exit(0), inputs=[], outputs=[])

        # Guaranteed fallback: the page heartbeats; the watchdog exits if it stops.
        hb_btn = gr.Button("hb", visible=False, elem_id="falcon-heartbeat-btn")
        hb_btn.click(fn=_heartbeat, inputs=[], outputs=[],
                     show_progress="hidden", queue=False)
        _start_shutdown_watchdog()

        demo.load(None, None, None, js="""
            () => {
                const stop = () => {
                    const btn = document.getElementById('falcon-shutdown-btn');
                    if (btn) btn.click();
                };
                window.addEventListener('beforeunload', stop);
                window.addEventListener('pagehide', stop);
                const beat = () => {
                    const hb = document.getElementById('falcon-heartbeat-btn');
                    if (hb) hb.click();
                };
                beat();
                setInterval(beat, 10000);
            }
        """)

if __name__ == "__main__":
    demo.launch()