Spaces:

MLSpeech
/

FALCON

Running

App Files Files Community

FALCON / app.py

MLSpeech

App: use the paper's exact table captions

34226e2 verified 1 day ago

Raw

History Blame Contribute Delete

29.1 kB

	"""
	FALCON web demo — interactive forced alignment in the browser.

	Three pretrained checkpoints are used by FALCON (under pretrained_models/):
	- falcon_timit_english.pt — TIMIT-trained, best for read-English phoneme alignment
	- falcon_buckeye_english.pt — Buckeye-trained, for spontaneous English
	- falcon_joint_multilingual.pt — joint TIMIT+Buckeye model; best for cross-lingual /
	multilingual zero-shot alignment (Dutch, German,
	Hebrew, ...) at both phoneme and word level.

	The app picks one automatically from the `Language` radio (english → TIMIT,
	multilingual → joint); the checkpoint radio can override that pick (e.g. to
	select the Buckeye model), and a custom .pt upload overrides everything.

	For HuggingFace Spaces deployment, set Space Secrets:
	HF_MODEL_REPO — e.g. "MLSpeech/FALCON-weights"
	HF_TOKEN — only needed for private repos
	The app downloads the selected checkpoint file (e.g. `falcon_timit_english.pt`
	or `falcon_joint_multilingual.pt`) from that repo on first use, when it is not
	already present under pretrained_models/.
	"""
	import os
	import re
	import shutil
	import sys
	import tempfile
	import threading
	import time

	# panphon 0.21.0 (pinned) ships an invalid-syntax line in featuretable.py — a stray
	# type annotation inside a function call — that breaks `import panphon` on a clean
	# install. Fix it on disk before anything imports panphon. Idempotent: a no-op when
	# panphon is already patched (e.g. local installs).
	def _patch_panphon():
	    """Patch the invalid-syntax line in the installed panphon, if it is there.

	    Locates the ``panphon`` package and rewrites the one known-bad line of its
	    ``featuretable.py`` in place. Does nothing when panphon cannot be found or
	    the bad line is absent (already patched). Any failure is printed and
	    swallowed: this is a best-effort startup fix and must never stop the app
	    from starting.
	    """
	    import importlib.util

	    bad = "word_features = self.word_fts(word: str, normalize: bool=True)"
	    good = "word_features = self.word_fts(word, normalize=True)"
	    try:
	        # The lookup lives inside the try as well, so a failing finder is
	        # reported like any other skipped patch instead of aborting startup.
	        spec = importlib.util.find_spec("panphon")
	        locs = getattr(spec, "submodule_search_locations", None) if spec else None
	        if not locs:
	            return
	        ft = os.path.join(list(locs)[0], "featuretable.py")
	        # Python source files are UTF-8 by default, so pin the codec rather
	        # than depending on the locale; newline="" leaves line endings as-is.
	        with open(ft, encoding="utf-8", newline="") as f:
	            src = f.read()
	        if bad in src:
	            with open(ft, "w", encoding="utf-8", newline="") as f:
	                f.write(src.replace(bad, good))
	    except Exception as exc:
	        print(f"[FALCON] panphon patch skipped: {exc}")

	_patch_panphon()

	import gradio as gr
	import textgrid
	import torchaudio

	import utils
	from predict import main_predict

	# On HF Spaces, point the "MFA-like" word G2P at the bundled dictionaries / G2P FST
	# and at this interpreter (which has pynini), so it works without a separate MFA
	# aligner conda env. No effect off Spaces — your local MFA install is used as-is.
	if os.environ.get("SPACE_ID"):
	    # Running on HF Spaces: fill in the MFA-related variables, but only the
	    # ones that are not already set in the environment.
	    _SPACE_DIR = os.path.dirname(os.path.abspath(__file__))
	    if "MFA_ROOT_DIR" not in os.environ:
	        os.environ["MFA_ROOT_DIR"] = os.path.join(_SPACE_DIR, "mfa_assets")
	    if "FDNFA_MFA_ENV_PY" not in os.environ:
	        os.environ["FDNFA_MFA_ENV_PY"] = sys.executable

	# ── Checkpoint configuration ──────────────────────────────────────────────────

	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	PRETRAINED_DIR = os.path.join(SCRIPT_DIR, "pretrained_models")

	# Checkpoint key (the value carried by the checkpoint radio) -> file name.
	CKPT_FILES = {
	    "english": "falcon_timit_english.pt",
	    "buckeye": "falcon_buckeye_english.pt",
	    "multilingual": "falcon_joint_multilingual.pt",
	}
	# Checkpoint key -> label shown next to the radio button.
	CKPT_LABELS = {
	    "english": "Read English (recommended) — TIMIT model",
	    "buckeye": "Spontaneous English (recommended) — Buckeye model",
	    "multilingual": "Multilingual — joint TIMIT+Buckeye model",
	}

	# Checkpoint key -> path found by an earlier _resolve_ckpt() call.
	_ckpt_cache = {}

	def _resolve_ckpt(key: str):
	    """Resolve a checkpoint key to a file path: cache → local file → HF Hub.

	    Args:
	        key: One of the keys of CKPT_FILES.

	    Returns:
	        The path of the checkpoint file, or None when the key is unknown, the
	        file is not under PRETRAINED_DIR, and it could not be fetched from the
	        repo named by the HF_MODEL_REPO environment variable.
	    """
	    filename = CKPT_FILES.get(key)
	    if filename is None:
	        # Keep the "None on failure" contract for unknown / missing keys
	        # instead of surfacing a KeyError to the caller.
	        print(f"[FALCON] Unknown checkpoint key: {key!r}")
	        return None

	    # A cached path is only trusted while the file still exists.
	    cached = _ckpt_cache.get(key)
	    if cached and os.path.exists(cached):
	        return cached

	    local_path = os.path.join(PRETRAINED_DIR, filename)
	    if os.path.exists(local_path):
	        _ckpt_cache[key] = local_path
	        return local_path

	    repo = os.environ.get("HF_MODEL_REPO", "")
	    if repo:
	        try:
	            from huggingface_hub import hf_hub_download
	            path = hf_hub_download(
	                repo_id=repo,
	                filename=filename,
	                token=os.environ.get("HF_TOKEN"),
	            )
	        except Exception as exc:
	            # Best effort: a missing package, network failure or bad token
	            # all end in "no checkpoint", which the caller reports.
	            print(f"[FALCON] HF Hub fetch failed for {filename}: {exc}")
	        else:
	            _ckpt_cache[key] = path
	            return path

	    return None

	# Held by run_alignment while it points utils' DP-matrix output directory at
	# the request's workspace, runs main_predict and renders the panels, so those
	# steps never run for two requests at the same time.
	_inference_lock = threading.Lock()

	# ── Heartbeat-based auto-shutdown (local runs only) ───────────────────────────
	# The open browser tab pings _heartbeat() every few seconds. A watchdog thread
	# exits the process when the pings stop (tab closed / browser crashed / unload
	# event dropped). _last_ping stays None until the first browser connects, so the
	# server never self-exits before anyone opens it.
	# One-element list so _heartbeat() can update the value in place. Holds a
	# time.monotonic() reading, or None until the first ping arrives.
	_last_ping = [None]
	_HEARTBEAT_TIMEOUT = 90  # secs of silence before the local server self-exits

	def _heartbeat():
	    """Record that the browser tab pinged the server just now."""
	    # Monotonic clock: a wall-clock adjustment must not look like a long
	    # silence (or hide one).
	    _last_ping[0] = time.monotonic()

	def _start_shutdown_watchdog():
	    """Start the daemon thread that exits the process when pings stop.

	    Every 5 seconds the thread compares the last ping against
	    _HEARTBEAT_TIMEOUT and calls ``os._exit(0)`` once the silence is longer.
	    Nothing happens while no ping has been recorded yet.

	    Returns:
	        The started ``threading.Thread``.
	    """
	    def _watch():
	        while True:
	            time.sleep(5)
	            last = _last_ping[0]
	            if last is not None and (time.monotonic() - last) > _HEARTBEAT_TIMEOUT:
	                os._exit(0)

	    watchdog = threading.Thread(target=_watch, daemon=True)
	    watchdog.start()
	    return watchdog

	# ── Internal language routing ────────────────────────────────────────────────

	def _internal_language(lang: str, mode: str, ann_ext: str) -> str:
	    """
	    Translate the UI selections into the `language` flag passed to main_predict.

	    "english" -> no G2P; the labels are taken to be TIMIT-39 phonemes already.
	                 Chosen only for English + phoneme mode + a .phn annotation.
	    "dutch"   -> G2P pipeline (panphon-based articulatory mapping). Chosen for
	                 everything else:
	                 • any non-English language
	                 • word-level alignment (.wrd, words need phoneme decomposition)
	                 • plain text input (could be words or arbitrary phonemes)

	    NOTE: pass the extension the user originally supplied as `ann_ext`, not the
	    extension left after a .txt → dummy .phn synthesis.
	    """
	    # Anything that is not English phoneme alignment goes through G2P.
	    if lang != "english" or mode != "phoneme":
	        return "dutch"
	    return "english" if ann_ext.lower() == "phn" else "dutch"

	# ── Word-level G2P selection ──────────────────────────────────────────────────

	# Optional input-language hint -> (espeak voice, MFA voice or None). MFA ships
	# pronunciation models only for en/de/nl; everything else uses espeak, or "none"
	# (romanized characters -> LH39) when even espeak has no voice.
	# Input-language label -> (espeak voice, MFA voice or None).
	_G2P_LANG_MAP = {
	    "English (default)": ("en-us", "en-us"),
	    "German": ("de", "de"),
	    "Dutch": ("nl", "nl"),
	    "Hebrew": ("he", None),
	    "French": ("fr", None),
	    "Spanish": ("es", None),
	    "Italian": ("it", None),
	    "Russian": ("ru", None),
	    "Portuguese": ("pt", None),
	    "Other / unknown": ("en-us", None),
	}
	# Dropdown entries, in the order the languages are listed above.
	G2P_LANG_CHOICES = list(_G2P_LANG_MAP)

	def _resolve_g2p(g2p_choice, lang_choice):
	    """Turn the (G2P option, input-language) UI choices into a concrete backend.

	    Returns a (backend, voice, note) triple with backend in {"mfa", "espeak",
	    "char"}. An explicit espeak / MFA-like / none choice is honored, falling
	    back to the other backend when the chosen one has nothing for the
	    language; any other choice ("Auto") prefers MFA, then espeak.
	    """
	    espeak_voice, mfa_voice = _G2P_LANG_MAP.get(lang_choice, ("en-us", "en-us"))

	    # MFA counts as usable only when the helper module imports and reports a
	    # model for this voice; any failure along the way means "not usable".
	    mfa_ok = False
	    try:
	        import mfa_g2p
	        if mfa_voice is not None:
	            mfa_ok = mfa_g2p.mfa_available(mfa_voice)
	    except Exception:
	        mfa_ok = False

	    requested = (g2p_choice or "Auto").lower()
	    if requested.startswith("none"):
	        return "char", espeak_voice or "en-us", "none (romanization)"

	    # Ordered (backend, note) preferences plus the note for the last resort.
	    if requested.startswith("mfa"):
	        plan = (
	            ("mfa", "MFA-like"),
	            ("espeak", "espeak (no MFA model for this language)"),
	        )
	        last_resort_note = "none (no MFA/espeak model)"
	    elif requested.startswith("espeak"):
	        plan = (
	            ("espeak", "espeak"),
	            ("mfa", "MFA-like (no espeak voice for this language)"),
	        )
	        last_resort_note = "none"
	    else:
	        # Auto (recommended)
	        plan = (
	            ("mfa", "MFA-like (auto)"),
	            ("espeak", "espeak (auto)"),
	        )
	        last_resort_note = "none (auto)"

	    candidates = {
	        "mfa": (mfa_ok, mfa_voice),
	        "espeak": (espeak_voice, espeak_voice),
	    }
	    for backend, note in plan:
	        usable, voice = candidates[backend]
	        if usable:
	            return backend, voice, note
	    return "char", "en-us", last_resort_note

	# ── Core handler ──────────────────────────────────────────────────────────────

	OUTPUTS_NONE = (None, None, None, None, None)  # 5 None for the non-status outputs


	def _alignment_failure(message, workspace=None):
	    """Return an error result tuple, deleting the scratch workspace if given."""
	    if workspace:
	        # Nothing in a failure result points into the workspace, so drop it.
	        shutil.rmtree(workspace, ignore_errors=True)
	    return (message, *OUTPUTS_NONE)


	def _read_annotation_tokens(ann_path, ext):
	    """Return the labels the user supplied, one per token.

	    .txt            → whitespace-separated tokens with punctuation stripped
	    anything else   → the last whitespace-separated field of each non-blank line
	    """
	    # utf-8-sig: explicit codec (not the locale's) that also drops a BOM.
	    with open(ann_path, encoding="utf-8-sig") as f:
	        if ext != "txt":
	            return [ln.split()[-1] for ln in f if ln.strip()]
	        tokens = re.sub(r"[^\w\s]", "", f.read().strip()).split()
	    # TIMIT .txt files are "<start_sample> <end_sample> <sentence>" — drop the
	    # leading sample indices so they aren't mistaken for words.
	    if len(tokens) >= 3 and tokens[0].isdigit() and tokens[1].isdigit():
	        tokens = tokens[2:]
	    return tokens


	def _write_uniform_phn(path, labels, n_samples):
	    """Write a dummy .phn that spreads `labels` evenly over `n_samples` samples."""
	    interval = n_samples / max(1, len(labels))
	    with open(path, "w", encoding="utf-8") as f:
	        for i, label in enumerate(labels):
	            f.write(f"{int(i * interval)} {int((i + 1) * interval)} {label}\n")


	def _phoneme_rows(bounds, labels):
	    """Pair every predicted boundary with its label.

	    Returns (rows, intervals): one ``[start, end, label]`` row per boundary,
	    rounded to 3 decimals, and the unrounded ``(start, end, label)`` intervals
	    of positive duration only.
	    """
	    rows, intervals = [], []
	    start = 0.0
	    for i, end in enumerate(bounds):
	        label = labels[i] if i < len(labels) else ""
	        rows.append([round(start, 3), round(end, 3), label])
	        if end > start:
	            intervals.append((start, end, label))
	        start = end
	    return rows, intervals


	def _token_rows(bounds, phones_per_token, tokens):
	    """Collapse phoneme boundaries into one row per original input token.

	    A token ends at the boundary of its last phoneme. Same return shape as
	    `_phoneme_rows`. Stops at the first token whose last phoneme has no
	    predicted boundary.
	    """
	    rows, intervals = [], []
	    consumed = 0
	    start = 0.0
	    for i, seq in enumerate(phones_per_token):
	        consumed += len(seq)
	        if consumed - 1 >= len(bounds):
	            break
	        # consumed == 0 means only phoneme-less tokens so far; index -1 would
	        # wrap around to the LAST boundary, so give them a zero-length span.
	        end = bounds[consumed - 1] if consumed else start
	        label = tokens[i] if i < len(tokens) else ""
	        rows.append([round(start, 3), round(end, 3), label])
	        if end > start:
	            intervals.append((start, end, label))
	        start = end
	    return rows, intervals


	def _write_textgrid(path, phn_intervals, orig_intervals, orig_tier_name):
	    """Write a TextGrid with a "phones" tier and, if any, the original tier."""
	    max_time = phn_intervals[-1][1] if phn_intervals else 0.0
	    tg = textgrid.TextGrid(minTime=0, maxTime=max_time)
	    tiers = [("phones", phn_intervals)]
	    if orig_intervals:
	        tiers.append((orig_tier_name, orig_intervals))
	    for tier_name, intervals in tiers:
	        tier = textgrid.IntervalTier(name=tier_name, minTime=0, maxTime=max_time)
	        for start, end, label in intervals:
	            tier.add(minTime=start, maxTime=end, mark=label)
	        tg.append(tier)
	    tg.write(path)


	def run_alignment(audio_file, annotation_file, ckpt_upload, mode, lang,
	                  pretrained_choice, g2p_choice="Auto (recommended)",
	                  lang_choice="English (default)",
	                  progress=gr.Progress(track_tqdm=True)):
	    """Align one audio file against its annotation and build the UI outputs.

	    Args:
	        audio_file: Path of the uploaded audio.
	        annotation_file: Path of the uploaded annotation (.phn / .wrd / .txt).
	        ckpt_upload: Optional path of a custom checkpoint; when given it is
	            used instead of `pretrained_choice`.
	        mode: "phoneme" or "word".
	        lang: "english" or "multilingual".
	        pretrained_choice: Key into CKPT_FILES.
	        g2p_choice: Word G2P option, resolved by `_resolve_g2p`.
	        lang_choice: Input-language hint, resolved by `_resolve_g2p`.
	        progress: Gradio progress reporter.

	    Returns:
	        A 6-tuple ``(status, audio path, panels image path or None, TextGrid
	        path, phoneme rows, original-token rows)``. On failure the status holds
	        the error message and the other five entries are None.
	    """
	    if not audio_file or not annotation_file:
	        return _alignment_failure("Please upload both an audio file and an annotation file.")

	    ckpt_to_use = ckpt_upload if ckpt_upload else _resolve_ckpt(pretrained_choice)
	    if not ckpt_to_use or not os.path.exists(ckpt_to_use):
	        expected = CKPT_FILES.get(pretrained_choice, "a pretrained checkpoint")
	        return _alignment_failure(
	            f"No checkpoint found. Expected {expected} "
	            f"in {PRETRAINED_DIR}, or HF_MODEL_REPO set, or upload a .pt file."
	        )

	    progress(0.1, desc="Preparing workspace...")
	    # Kept on success (the returned TextGrid / panels live in it); removed by
	    # _alignment_failure on every error return below.
	    workspace = tempfile.mkdtemp(prefix="falcon_")
	    base = "input"
	    wav_path = os.path.join(workspace, f"{base}.wav")

	    original_ext = os.path.basename(annotation_file).split(".")[-1].lower()
	    ann_ext = original_ext
	    ann_path = os.path.join(workspace, f"{base}.{ann_ext}")

	    # Resample audio to 16 kHz mono
	    try:
	        audio, sr = torchaudio.load(audio_file)
	        if audio.shape[0] > 1:
	            audio = audio.mean(dim=0, keepdim=True)
	        if sr != 16000:
	            audio = torchaudio.functional.resample(audio, sr, 16000)
	        torchaudio.save(wav_path, audio, 16000)
	    except Exception as exc:
	        return _alignment_failure(f"Audio error: {exc}", workspace)
	    audio_len = audio.shape[1]

	    # Capture the original input tokens (whatever the user supplied per line):
	    #   .phn → phoneme labels
	    #   .wrd → word labels
	    #   .txt → space-separated tokens (words or phonemes)
	    try:
	        shutil.copy(annotation_file, ann_path)
	        orig_tokens = _read_annotation_tokens(ann_path, original_ext)
	    except (OSError, UnicodeError) as exc:
	        return _alignment_failure(f"Annotation error: {exc}", workspace)

	    if original_ext == "txt":
	        if not orig_tokens:
	            return _alignment_failure(
	                "Text annotation is empty after stripping punctuation.", workspace)
	        # Synthesize a uniform-segments dummy .phn so downstream code has timestamps.
	        ann_ext = "phn"
	        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
	        _write_uniform_phn(ann_path, orig_tokens, audio_len)
	    elif not orig_tokens:
	        # Nothing to align; say so here rather than failing inside inference.
	        return _alignment_failure("Annotation file contains no labels.", workspace)

	    # Route by ORIGINAL extension (post-rewrite ann_ext is "phn" for txt inputs).
	    language = _internal_language(lang, mode, original_ext)

	    # Word/text inputs (.wrd / .txt / .word) always go through the word -> LH39
	    # G2P with the chosen front-end (espeak, or the MFA english_us_arpa G2P used
	    # in the paper), regardless of the phoneme/word toggle — otherwise phoneme
	    # mode would fall back to a crude character mapping and misalign. The result
	    # is then aligned via the stock phoneme path. (.phn is the phoneme-input path.)
	    app_mapped_ph = None
	    word_g2p_note = ""
	    if original_ext in ("wrd", "txt", "word") and orig_tokens:
	        import word_g2p
	        backend, voice, g2p_note = _resolve_g2p(g2p_choice, lang_choice)
	        try:
	            app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice, backend=backend)
	                             for tok in orig_tokens]
	        except Exception as g2p_exc:
	            # A missing model must never break the whole run — fall back.
	            print(f"[FALCON] G2P backend '{backend}' failed ({g2p_exc}); falling back.")
	            try:
	                app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice or "en-us",
	                                                       backend="espeak")
	                                 for tok in orig_tokens]
	                g2p_note += " → espeak fallback"
	            except Exception:
	                app_mapped_ph = [word_g2p.word_to_lh39(tok, backend="char")
	                                 for tok in orig_tokens]
	                g2p_note += " → none fallback"
	        word_g2p_note = f" Word G2P: {g2p_note}."
	        phons = [ph for seq in app_mapped_ph for ph in seq] or ["sil"]
	        # Rewrite the annotation as a dummy uniform-time .phn of LH39 phonemes so
	        # the stock English/phoneme aligner runs on them (no further G2P).
	        ann_ext = "phn"
	        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
	        _write_uniform_phn(ann_path, phons, audio_len)
	        language = "english"  # phonemes are already LH39; skip the internal G2P

	    progress(0.3, desc="Running alignment...")
	    panels_path = os.path.join(workspace, "panels.png")
	    try:
	        with _inference_lock:
	            utils.set_dp_matrix_out_dir(workspace)
	            try:
	                pred_bound, _truth_bound, mapped_ph = main_predict(
	                    wav=wav_path,
	                    ckpt=ckpt_to_use,
	                    w_phi=0.5,  # placeholder; the model uses its own learned w_phi at inference
	                    language=language,
	                    annotation=ann_ext,
	                )
	            finally:
	                # Reset while the lock is still held, on success and on
	                # failure, so another request's directory is never clobbered.
	                utils.set_dp_matrix_out_dir(None)
	            progress(0.6, desc="Rendering aligned visualization...")
	            # Time-aligned representations as one stacked figure (waveform,
	            # spectrogram, phoneme posteriors, Soft-DP matrix + path, contrastive
	            # score) — all on the same time axis with predicted boundaries overlaid.
	            try:
	                import falcon_viz
	                falcon_viz.make_alignment_panels(
	                    wav=wav_path, ckpt=ckpt_to_use, out_path=panels_path,
	                    language=language, annotation=ann_ext,
	                    show_truth=(original_ext == "phn" and mode == "phoneme"),
	                )
	            except Exception as viz_exc:
	                # The figure is optional; alignment results are still returned.
	                print(f"[FALCON] panel viz failed: {viz_exc}")
	                panels_path = None
	    except Exception as exc:
	        return _alignment_failure(f"Inference error: {exc}", workspace)

	    # When the app did the word-level G2P itself, use its per-word LH39 phoneme
	    # lists for the two-table / TextGrid word tier (main_predict's English path
	    # returns mapped_ph=None).
	    if app_mapped_ph is not None:
	        mapped_ph = app_mapped_ph

	    progress(0.8, desc="Building outputs...")

	    pred_bound_list = [float(t) for t in pred_bound]

	    # ── Build two tables ─────────────────────────────────────────────────────
	    # 1) LH39 phonemes — the aligner's direct output, one row per pred_bound.
	    # 2) Original tokens — words (.wrd / .txt) or non-LH39 phonemes (multilingual
	    #    .phn). Only populated when the G2P path was used (mapped_ph != None);
	    #    for english+phoneme+.phn the LH39 phonemes ARE the original, so the
	    #    second table is left empty.
	    # Every input token gets a row in the table even if its predicted interval
	    # is degenerate; degenerate intervals are still kept out of the TextGrid
	    # (Praat rejects zero-length).
	    if mapped_ph is not None:
	        phn_labels = [ph for seq in mapped_ph for ph in seq]
	    else:
	        phn_labels = orig_tokens
	    table_phonemes, phn_intervals = _phoneme_rows(pred_bound_list, phn_labels)

	    table_original, orig_intervals = [], []
	    if mapped_ph is not None:
	        table_original, orig_intervals = _token_rows(
	            pred_bound_list, mapped_ph, orig_tokens)

	    # ── TextGrid: phones tier always, original tier when applicable ──────────
	    # Tier name reflects what the original layer represents.
	    if mode == "word":
	        orig_tier_name = "words"
	    elif original_ext == "phn":
	        orig_tier_name = "phones_original"
	    else:
	        orig_tier_name = "tokens"
	    tg_path = os.path.join(workspace, f"{base}.TextGrid")
	    _write_textgrid(tg_path, phn_intervals, orig_intervals, orig_tier_name)

	    # Status note: the .phn-as-word case produces a second "words" table that
	    # actually contains phonemes — surface this in the status so it's not
	    # mistaken for a bug.
	    status_note = word_g2p_note
	    if mode == "word" and original_ext == "phn":
	        status_note += (" Note: input was phoneme-level (.phn) but mode=word "
	                        "— the 'original' table shows input phonemes since no "
	                        "word annotations were provided.")

	    return (
	        "Done." + status_note,
	        audio_file,
	        panels_path if panels_path and os.path.exists(panels_path) else None,
	        tg_path,
	        table_phonemes,
	        table_original,
	    )

	# ── UI ────────────────────────────────────────────────────────────────────────

	# (label shown, value returned) pairs for the checkpoint radio, in display order.
	PRETRAINED_RADIO_CHOICES = [
	    (CKPT_LABELS[ckpt_key], ckpt_key)
	    for ckpt_key in ("english", "buckeye", "multilingual")
	]

	# Benchmark results, shown under the example. Numbers are exactly those reported
	# in the paper (arXiv:2606.25460). "specialist" = trained on the target English
	# corpus; "joint" = one model jointly trained on TIMIT+Buckeye; multilingual rows
	# are zero-shot (no target-language training data).
	# Table cells are separated by plain "|": a backslash-escaped pipe is an invalid
	# escape sequence in a Python string and is not a Markdown cell separator.
	RESULTS_MD = """### Results (from the [paper](https://arxiv.org/abs/2606.25460))

	Accuracy = % of reference boundaries matched within the ms tolerance. Specialist =
	trained on the target English corpus; joint = one model jointly trained on
	TIMIT+Buckeye; multilingual rows are zero-shot (no target-language training data).

	Phone-level Alignment Accuracy [%]: MFA vs. FALCON (Ours)

	| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
	|---|---|---|---|---|---|
	| TIMIT | MFA | 38.6 | 72.3 | 81.1 | 84.6 |
	| TIMIT | FALCON specialist | 37.66 | 83.88 | 94.85 | 98.62 |
	| TIMIT | FALCON joint | 34.70 | 82.62 | 94.91 | 98.60 |
	| Buckeye | MFA | 35.3 | 60.6 | 68.9 | 72.7 |
	| Buckeye | FALCON specialist | 29.69 | 69.93 | 90.07 | 97.40 |
	| Buckeye | FALCON joint | 28.87 | 69.40 | 89.53 | 97.13 |

	Phoneme-Level: Unseen Multilingual Generalization Accuracy

	| Test set | Model | ≤10 | ≤15 | ≤20 | ≤25 | ≤50 | ≤100 |
	|---|---|---|---|---|---|---|---|
	| Dutch — IFA | FALCON joint | 26.85 | 36.16 | 44.56 | 51.17 | 69.94 | 84.11 |
	| Dutch — IFA | FALCON specialist | 26.86 | 35.79 | 43.85 | 50.34 | 68.68 | 83.22 |
	| Dutch — IFA | MFA | 11.01 | 14.70 | 19.05 | 21.80 | 33.90 | 51.02 |
	| German — PHONDAT | FALCON joint | 25.63 | 34.12 | 41.87 | 49.07 | 70.04 | 84.58 |
	| German — PHONDAT | FALCON specialist | 25.08 | 33.37 | 40.76 | 47.43 | 68.27 | 82.44 |
	| German — PHONDAT | MFA | 20.60 | 31.75 | 37.17 | 45.83 | 66.78 | 79.19 |
	| Hebrew | FALCON joint | 21.98 | 30.10 | 36.91 | 42.78 | 63.07 | 80.41 |
	| Hebrew | FALCON specialist | 21.03 | 27.78 | 34.30 | 39.79 | 59.38 | 77.76 |

	Word-Level Alignment Accuracy [%]: Comparative Analysis

	| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
	|---|---|---|---|---|---|
	| TIMIT | FALCON spec (MFA-G2P) | 49.22 | 81.79 | 93.04 | 98.37 |
	| TIMIT | FALCON joint (MFA-G2P) | 49.50 | 80.60 | 92.86 | 98.46 |
	| TIMIT | MFA | 41.60 | 72.80 | 89.40 | 97.40 |
	| TIMIT | MMS | 18.60 | 43.50 | 75.70 | 94.70 |
	| TIMIT | WhisperX | 22.40 | 52.70 | 82.40 | 94.20 |
	| TIMIT | Nvidia-Canary-1b | 9.23 | 23.11 | 44.23 | 72.81 |
	| Buckeye | FALCON spec (MFA-G2P) | 50.06 | 77.85 | 91.51 | 96.63 |
	| Buckeye | FALCON joint (MFA-G2P) | 50.42 | 77.98 | 91.01 | 96.55 |
	| Buckeye | MFA | 39.80 | 69.90 | 84.90 | 91.80 |
	| Buckeye | MMS | 25.00 | 52.70 | 75.00 | 87.90 |
	| Buckeye | WhisperX | 18.80 | 43.10 | 67.40 | 77.40 |
	| Buckeye | Nvidia-Canary-1b | 8.06 | 18.83 | 36.31 | 63.29 |

	Word-Level: Unseen Multilingual Generalization Accuracy

	| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
	|---|---|---|---|---|---|
	| German — PHONDAT | FALCON (MFA-G2P) | 44.20 | 68.48 | 86.12 | 95.11 |
	| German — PHONDAT | MFA | 29.9 | 65.4 | 82.1 | 94.3 |
	| German — PHONDAT | MMS | 21.8 | 44.3 | 74.9 | 91.8 |
	| Dutch — IFA | FALCON (MFA-G2P) | 26.38 | 45.15 | 61.16 | 76.49 |
	| Dutch — IFA | MFA | 4.7 | 7.3 | 11.6 | 19.0 |
	| Dutch — IFA | MMS | 16.0 | 37.9 | 62.9 | 76.6 |
	| Hebrew | FALCON | 31.91 | 56.72 | 75.18 | 87.89 |
	| Hebrew | MMS | 14.3 | 41.3 | 76.5 | 94.7 |
	"""

	# Build the Gradio page: input widgets, output widgets, one loadable example,
	# the results tables, and the event wiring that connects them to run_alignment.
	with gr.Blocks(title="FALCON Forced Aligner", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# FALCON: Forced Alignment through Contrastive Optimization Networks")
	gr.Markdown(
	"Upload a speech file and a transcript to predict precise phoneme or word boundaries "
	"using Soft Dynamic Programming."
	)

	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Inputs")
	audio_in = gr.Audio(label="Audio file (any sample rate)", type="filepath")
	ann_in = gr.File(label="Annotation (.phn / .wrd / .txt)")
	mode_in = gr.Radio(["phoneme", "word"], value="phoneme", label="Mode")
	lang_in = gr.Radio(["english", "multilingual"], value="english", label="Language")
	# Word-level G2P front-end (word mode only). The espeak voice / MFA
	# dictionary is chosen automatically from the optional input-language
	# hint below; "Auto" also picks the best available backend.
	g2p_in = gr.Radio(
	["Auto (recommended)", "espeak", "MFA-like",
	"none — romanization (not recommended; only for languages with no G2P model)"],
	value="Auto (recommended)",
	label="Word G2P (used only in word mode)",
	)
	lang_g2p_in = gr.Dropdown(
	G2P_LANG_CHOICES,
	value="English (default)",
	label="Input language (optional, recommended — improves G2P choice)",
	)
	# The radio's values are the CKPT_FILES keys; the labels come from CKPT_LABELS.
	pretrained_in = gr.Radio(
	choices=PRETRAINED_RADIO_CHOICES,
	value="english",
	label="Pretrained checkpoint (auto-follows Language; override here if you want)",
	)
	ckpt_in = gr.File(label="Or upload a custom checkpoint (.pt) — overrides pretrained",
	file_types=[".pt"], type="filepath")
	btn = gr.Button("Run Alignment", variant="primary")
	status = gr.Textbox(label="Status", interactive=False)

	with gr.Column(scale=2):
	gr.Markdown("### Outputs")
	with gr.Tabs():
	with gr.Tab("Alignment Data"):
	audio_out = gr.Audio(label="Playback", interactive=False)
	table_phn_out = gr.Dataframe(
	headers=["Start (s)", "End (s)", "Phoneme (LH39)"],
	label="LH39 phonemes — aligner output",
	)
	table_orig_out = gr.Dataframe(
	headers=["Start (s)", "End (s)", "Label"],
	label="Original input layer (words / non-LH39 phonemes) — empty for English phoneme alignment",
	)
	tg_out = gr.File(label="Download TextGrid (carries both tiers when applicable)")
	with gr.Tab("Visualizations"):
	img_panels = gr.Image(
	label="Time-aligned representations — waveform · spectrogram · phoneme posteriors · Soft-DP path · contrastive score (shared time axis; predicted boundaries overlaid). Click to enlarge.",
	show_download_button=True,
	)

	# The values in the example row follow the order of `inputs` below.
	gr.Markdown("### Example — click to load it, then press Run Alignment")
	gr.Examples(
	examples=[
	["assets/fasw0sa2.wav", "assets/fasw0sa2.phn", "phoneme",
	"english", "english", "Auto (recommended)", "English (default)"],
	],
	inputs=[audio_in, ann_in, mode_in, lang_in, pretrained_in, g2p_in, lang_g2p_in],
	label="English · TIMIT",
	)

	gr.Markdown(RESULTS_MD)

	# Auto-flip the pretrained-checkpoint radio when the user changes language.
	# Both Language values ("english", "multilingual") are also checkpoint keys,
	# so passing the value through unchanged selects the matching checkpoint.
	lang_in.change(fn=lambda v: v, inputs=lang_in, outputs=pretrained_in)

	# `inputs` follows run_alignment's positional parameter order (progress is
	# left to its default); `outputs` follows the 6-tuple that run_alignment returns.
	btn.click(
	fn=run_alignment,
	inputs=[audio_in, ann_in, ckpt_in, mode_in, lang_in, pretrained_in,
	g2p_in, lang_g2p_in],
	outputs=[status, audio_out, img_panels,
	tg_out, table_phn_out, table_orig_out],
	)

	# Auto-shutdown the local Python server when the user closes the browser
	# tab. Disabled on HuggingFace Spaces (where SPACE_ID is set automatically)
	# because the container is shared across visitors — one tab close should
	# not tear down everyone else's session.
	if not os.environ.get("SPACE_ID"):
	# Fast path: unload events click a hidden Shutdown button -> os._exit.
	shutdown_btn = gr.Button("Shutdown", visible=False, elem_id="falcon-shutdown-btn")
	shutdown_btn.click(fn=lambda: os._exit(0), inputs=[], outputs=[])

	# Guaranteed fallback: the page heartbeats; the watchdog exits if it stops.
	hb_btn = gr.Button("hb", visible=False, elem_id="falcon-heartbeat-btn")
	hb_btn.click(fn=_heartbeat, inputs=[], outputs=[],
	show_progress="hidden", queue=False)
	_start_shutdown_watchdog()

	# Page-load script: clicks the hidden shutdown button on beforeunload /
	# pagehide, and clicks the hidden heartbeat button once immediately and then
	# every 10 s. Each click is skipped when its button is not in the page.
	demo.load(None, None, None, js="""
	() => {
	const stop = () => {
	const btn = document.getElementById('falcon-shutdown-btn');
	if (btn) btn.click();
	};
	window.addEventListener('beforeunload', stop);
	window.addEventListener('pagehide', stop);
	const beat = () => {
	const hb = document.getElementById('falcon-heartbeat-btn');
	if (hb) hb.click();
	};
	beat();
	setInterval(beat, 10000);
	}
	""")

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()