FALCON / app.py
MLSpeech's picture
App: use the paper's exact table captions
34226e2 verified
Raw
History Blame Contribute Delete
29.1 kB
"""
FALCON web demo — interactive forced alignment in the browser.
Three pretrained checkpoints are used by FALCON (under pretrained_models/):
  - falcon_timit_english.pt      — TIMIT-trained, best for English phoneme alignment
  - falcon_buckeye_english.pt    — Buckeye model, recommended for spontaneous English
  - falcon_joint_multilingual.pt — joint TIMIT+Buckeye model; best for cross-lingual /
                                   multilingual zero-shot alignment (Dutch, German,
                                   Hebrew, ...) at both phoneme and word level.
The app picks one automatically from the `Language` radio (english → TIMIT,
multilingual → joint); the Buckeye model is chosen manually from the checkpoint
radio, and a custom .pt upload overrides all of them.
For HuggingFace Spaces deployment, set Space Secrets:
HF_MODEL_REPO — e.g. "MLSpeech/FALCON-weights"
HF_TOKEN — only needed for private repos
The app downloads the checkpoint file it needs (e.g. `falcon_timit_english.pt` or
`falcon_joint_multilingual.pt`) from that repo the first time it is not found locally.
"""
import os
import re
import shutil
import sys
import tempfile
import threading
import time
# panphon 0.21.0 (pinned) ships an invalid-syntax line in featuretable.py — a stray
# type annotation inside a function call — that breaks `import panphon` on a clean
# install. Fix it on disk before anything imports panphon. Idempotent: a no-op when
# panphon is already patched (e.g. local installs).
def _patch_panphon():
import importlib.util
spec = importlib.util.find_spec("panphon")
locs = getattr(spec, "submodule_search_locations", None) if spec else None
if not locs:
return
ft = os.path.join(list(locs)[0], "featuretable.py")
bad = "word_features = self.word_fts(word: str, normalize: bool=True)"
good = "word_features = self.word_fts(word, normalize=True)"
try:
with open(ft) as f:
src = f.read()
if bad in src:
with open(ft, "w") as f:
f.write(src.replace(bad, good))
except Exception as exc:
print(f"[FALCON] panphon patch skipped: {exc}")
_patch_panphon()
import gradio as gr
import textgrid
import torchaudio
import utils
from predict import main_predict
# On HF Spaces, point the "MFA-like" word G2P at the bundled dictionaries / G2P FST
# and at this interpreter (which has pynini), so it works without a separate MFA
# aligner conda env. No effect off Spaces — your local MFA install is used as-is.
# setdefault is used so values already present in the environment are kept.
if os.environ.get("SPACE_ID"):
    _SPACE_DIR = os.path.dirname(os.path.abspath(__file__))
    os.environ.setdefault("MFA_ROOT_DIR", os.path.join(_SPACE_DIR, "mfa_assets"))
    os.environ.setdefault("FDNFA_MFA_ENV_PY", sys.executable)
# ── Checkpoint configuration ──────────────────────────────────────────────────
# Bundled checkpoints are looked up in <this file's directory>/pretrained_models.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PRETRAINED_DIR = os.path.join(SCRIPT_DIR, "pretrained_models")
# Checkpoint key -> file name (used both for the local lookup and the HF Hub fetch).
CKPT_FILES = {
    "english": "falcon_timit_english.pt",
    "buckeye": "falcon_buckeye_english.pt",
    "multilingual": "falcon_joint_multilingual.pt",
}
# Checkpoint key -> human-readable label; must carry the same keys as CKPT_FILES.
CKPT_LABELS = {
    "english": "Read English (recommended) — TIMIT model",
    "buckeye": "Spontaneous English (recommended) — Buckeye model",
    "multilingual": "Multilingual — joint TIMIT+Buckeye model",
}
# Checkpoint key -> path resolved earlier in this process (filled by _resolve_ckpt).
_ckpt_cache = {}
def _resolve_ckpt(key: str):
    """Resolve a checkpoint key to a file path, or None if it cannot be found.

    Lookup order: in-process cache → file in PRETRAINED_DIR → download from the
    HF Hub repo named by the HF_MODEL_REPO environment variable (HF_TOKEN is
    passed along when set).

    Args:
        key: A key of CKPT_FILES ("english", "buckeye", "multilingual").

    Returns:
        The checkpoint path, or None when the key is unknown or every lookup
        fails. Never raises for an unknown key.
    """
    cached = _ckpt_cache.get(key)
    # A cached path is only trusted while the file still exists.
    if cached and os.path.exists(cached):
        return cached

    filename = CKPT_FILES.get(key)
    if filename is None:
        # Unknown / missing selection: report "not found" instead of KeyError.
        return None

    local_path = os.path.join(PRETRAINED_DIR, filename)
    if os.path.exists(local_path):
        _ckpt_cache[key] = local_path
        return local_path

    repo = os.environ.get("HF_MODEL_REPO", "")
    if repo:
        try:
            # Imported lazily so huggingface_hub is only needed when a download
            # is actually attempted.
            from huggingface_hub import hf_hub_download
            path = hf_hub_download(
                repo_id=repo,
                filename=filename,
                token=os.environ.get("HF_TOKEN"),
            )
            _ckpt_cache[key] = path
            return path
        except Exception as exc:
            # Best-effort: a failed fetch is reported and treated as "not found".
            print(f"[FALCON] HF Hub fetch failed for {filename}: {exc}")
    return None
# Held by run_alignment while it points utils at the per-request output dir and
# runs main_predict, so concurrent requests do not interleave that global state.
_inference_lock = threading.Lock()
# ── Heartbeat-based auto-shutdown (local runs only) ───────────────────────────
# The open browser tab pings _heartbeat() every few seconds. A watchdog thread
# exits the process when the pings stop (tab closed / browser crashed / unload
# event dropped). _last_ping stays None until the first browser connects, so the
# server never self-exits before anyone opens it.
# Timestamps come from time.monotonic(), so adjusting the system clock can
# neither trigger nor postpone the shutdown.
_last_ping = [None]  # one-element list: updated in place, no `global` needed
_HEARTBEAT_TIMEOUT = 90  # secs of silence before the local server self-exits


def _heartbeat():
    """Record that the browser tab is still alive (called by the hidden hb button)."""
    _last_ping[0] = time.monotonic()


def _start_shutdown_watchdog():
    """Start a daemon thread that exits the process once heartbeats stop.

    The thread wakes every 5 seconds and calls os._exit(0) when a heartbeat has
    been seen before and the last one is older than _HEARTBEAT_TIMEOUT seconds.
    """
    def _watch():
        while True:
            time.sleep(5)
            last = _last_ping[0]
            if last is not None and (time.monotonic() - last) > _HEARTBEAT_TIMEOUT:
                # Hard exit: no browser is attached, nothing to shut down gracefully.
                os._exit(0)

    threading.Thread(target=_watch, daemon=True).start()
# ── Internal language routing ────────────────────────────────────────────────
def _internal_language(lang: str, mode: str, ann_ext: str) -> str:
"""
Map UI choices to the internal `language` flag understood by main_predict.
'english' = no G2P; assumes labels are already TIMIT-39 phonemes.
'dutch' = G2P pipeline (panphon-based articulatory mapping). Used for:
• any non-English language
• word-level alignment (.wrd, words need phoneme decomposition)
• plain text input (could be words or arbitrary phonemes)
NOTE: `ann_ext` here must be the *original* extension supplied by the user —
not the post-rewrite extension after a .txt → dummy .phn synthesis.
"""
if lang == "english" and mode == "phoneme" and ann_ext.lower() == "phn":
return "english"
return "dutch"
# ── Word-level G2P selection ──────────────────────────────────────────────────
# Optional input-language hint -> (espeak voice, MFA voice or None). MFA ships
# pronunciation models only for en/de/nl; everything else uses espeak, or "none"
# (romanized characters -> LH39) when even espeak has no voice.
# Choices offered by the "Input language" dropdown; each must be a key of
# _G2P_LANG_MAP below.
G2P_LANG_CHOICES = [
    "English (default)", "German", "Dutch", "Hebrew", "French",
    "Spanish", "Italian", "Russian", "Portuguese", "Other / unknown",
]
# Dropdown choice -> (espeak voice, MFA voice). An MFA voice of None means no
# MFA model is listed for that language. Every entry has an espeak voice;
# "Other / unknown" uses the en-us espeak voice and no MFA model.
_G2P_LANG_MAP = {
    "English (default)": ("en-us", "en-us"),
    "German": ("de", "de"),
    "Dutch": ("nl", "nl"),
    "Hebrew": ("he", None),
    "French": ("fr", None),
    "Spanish": ("es", None),
    "Italian": ("it", None),
    "Russian": ("ru", None),
    "Portuguese": ("pt", None),
    "Other / unknown": ("en-us", None),
}
def _resolve_g2p(g2p_choice, lang_choice):
    """Turn the (G2P option, input-language) UI selections into a usable backend.

    Returns a (backend, voice, note) triple where backend is one of "mfa",
    "espeak" or "char" and note is a short description for the status line.
    An explicit espeak / MFA-like / none request is honored when possible and
    otherwise falls back to the next backend that has a model for the language;
    anything else (including no choice) is treated as "Auto" and takes the best
    backend available.
    """
    espeak_voice, mfa_voice = _G2P_LANG_MAP.get(lang_choice, ("en-us", "en-us"))

    # MFA counts as usable only when the language has an MFA voice and the
    # mfa_g2p module reports it available; any failure means "not usable".
    try:
        import mfa_g2p
        mfa_ok = mfa_voice is not None and mfa_g2p.mfa_available(mfa_voice)
    except Exception:
        mfa_ok = False

    requested = (g2p_choice or "Auto").lower()
    if requested.startswith("none"):
        return "char", espeak_voice or "en-us", "none (romanization)"

    # Candidates are (usable?, backend, voice, note), tried in order of preference.
    if requested.startswith("mfa"):
        first = (mfa_ok, "mfa", mfa_voice, "MFA-like")
        second = (espeak_voice, "espeak", espeak_voice,
                  "espeak (no MFA model for this language)")
        last_resort_note = "none (no MFA/espeak model)"
    elif requested.startswith("espeak"):
        first = (espeak_voice, "espeak", espeak_voice, "espeak")
        second = (mfa_ok, "mfa", mfa_voice,
                  "MFA-like (no espeak voice for this language)")
        last_resort_note = "none"
    else:
        # Auto (recommended)
        first = (mfa_ok, "mfa", mfa_voice, "MFA-like (auto)")
        second = (espeak_voice, "espeak", espeak_voice, "espeak (auto)")
        last_resort_note = "none (auto)"

    for usable, backend, voice, note in (first, second):
        if usable:
            return backend, voice, note
    return "char", "en-us", last_resort_note
# ── Core handler ──────────────────────────────────────────────────────────────
# Placeholder values for the five non-status outputs, used by every early return.
OUTPUTS_NONE = (None, None, None, None, None)

# Annotation extensions treated as orthographic word/text input (sent through G2P).
_WORD_INPUT_EXTS = ("wrd", "txt", "word")


def _read_annotation_tokens(ann_path, ext):
    """Return the labels the user supplied, one per token.

    .txt → whitespace-separated tokens with punctuation removed; other
    extensions → the last whitespace-separated field of every non-blank line.
    """
    # utf-8-sig decodes the same way on every platform and drops a leading BOM,
    # which would otherwise stick to the first token.
    with open(ann_path, encoding="utf-8-sig") as f:
        if ext != "txt":
            return [ln.strip().split()[-1] for ln in f if ln.strip()]
        tokens = re.sub(r"[^\w\s]", "", f.read().strip()).split()
    # TIMIT .txt files are "<start_sample> <end_sample> <sentence>" — drop the
    # leading sample indices so they aren't mistaken for words.
    if len(tokens) >= 3 and tokens[0].isdigit() and tokens[1].isdigit():
        tokens = tokens[2:]
    return tokens


def _write_uniform_phn(path, labels, n_samples):
    """Write `labels` as a dummy .phn splitting `n_samples` into equal segments."""
    interval = n_samples / max(1, len(labels))
    # Default encoding on purpose: the file is read back by code not visible
    # here, so the writer is left exactly as it was.
    with open(path, "w") as f:
        f.writelines(
            f"{int(i * interval)} {int((i + 1) * interval)} {label}\n"
            for i, label in enumerate(labels)
        )


def _g2p_words(tokens, g2p_choice, lang_choice):
    """Map every word to its LH39 phoneme list; returns (per-word lists, note)."""
    import word_g2p
    backend, voice, note = _resolve_g2p(g2p_choice, lang_choice)
    try:
        mapped = [word_g2p.word_to_lh39(tok, voice=voice, backend=backend)
                  for tok in tokens]
    except Exception as g2p_exc:
        # A missing model must never break the whole run — fall back.
        print(f"[FALCON] G2P backend '{backend}' failed ({g2p_exc}); falling back.")
        try:
            mapped = [word_g2p.word_to_lh39(tok, voice=voice or "en-us",
                                            backend="espeak")
                      for tok in tokens]
            note += " → espeak fallback"
        except Exception:
            mapped = [word_g2p.word_to_lh39(tok, backend="char")
                      for tok in tokens]
            note += " → none fallback"
    return mapped, note


def _build_alignment_tables(pred_bound_list, mapped_ph, orig_tokens):
    """Build the table rows and TextGrid intervals for both output layers.

    Returns (table_phonemes, phn_intervals, table_original, orig_intervals).
    Table rows are [start, end, label] rounded to 3 decimals and include every
    entry; the interval lists keep only entries with end > start, because
    zero-length intervals are kept out of the TextGrid. The "original" layer
    is populated only when `mapped_ph` is not None.
    """
    if mapped_ph is not None:
        phn_labels = [ph for seq in mapped_ph for ph in seq]
    else:
        phn_labels = orig_tokens

    # Layer 1: one row per predicted boundary; each boundary ends a segment.
    table_phonemes, phn_intervals = [], []
    t0 = 0.0
    for i, t1 in enumerate(pred_bound_list):
        lbl = phn_labels[i] if i < len(phn_labels) else ""
        table_phonemes.append([round(t0, 3), round(t1, 3), lbl])
        if t1 > t0:
            phn_intervals.append((t0, t1, lbl))
        t0 = t1

    # Layer 2: each original token ends at the boundary of its last phoneme.
    table_original, orig_intervals = [], []
    if mapped_ph is not None:
        consumed = 0
        t0 = 0.0
        for i, seq in enumerate(mapped_ph):
            consumed += len(seq)
            if consumed > len(pred_bound_list):
                break
            # Tokens with no phonemes before any boundary was consumed get a
            # zero-length row; indexing with consumed - 1 == -1 would wrongly
            # pick the LAST boundary.
            t1 = pred_bound_list[consumed - 1] if consumed else t0
            lbl = orig_tokens[i] if i < len(orig_tokens) else ""
            table_original.append([round(t0, 3), round(t1, 3), lbl])
            if t1 > t0:
                orig_intervals.append((t0, t1, lbl))
            t0 = t1
    return table_phonemes, phn_intervals, table_original, orig_intervals


def _write_textgrid(path, phn_intervals, orig_intervals, orig_tier_name):
    """Write a TextGrid with a "phones" tier and, if non-empty, the original tier."""
    max_time = phn_intervals[-1][1] if phn_intervals else 0.0
    tg = textgrid.TextGrid(minTime=0, maxTime=max_time)
    tiers = [("phones", phn_intervals)]
    if orig_intervals:
        tiers.append((orig_tier_name, orig_intervals))
    for tier_name, intervals in tiers:
        tier = textgrid.IntervalTier(name=tier_name, minTime=0, maxTime=max_time)
        for start, end, mark in intervals:
            tier.add(minTime=start, maxTime=end, mark=mark)
        tg.append(tier)
    tg.write(path)


def run_alignment(audio_file, annotation_file, ckpt_upload, mode, lang,
                  pretrained_choice, g2p_choice="Auto (recommended)",
                  lang_choice="English (default)",
                  progress=gr.Progress(track_tqdm=True)):
    """Align one audio file to its annotation and build the UI outputs.

    Args:
        audio_file: Path of the uploaded audio (any sample rate / channel count).
        annotation_file: Path of the uploaded annotation (.phn / .wrd / .txt).
        ckpt_upload: Optional path of a custom checkpoint; overrides
            `pretrained_choice` when given.
        mode: "phoneme" or "word".
        lang: "english" or "multilingual".
        pretrained_choice: Key of CKPT_FILES naming the bundled checkpoint.
        g2p_choice: Word G2P option from the UI.
        lang_choice: Input-language hint from the UI.
        progress: Gradio progress reporter.

    Returns:
        A 6-tuple (status text, audio path for playback, panels image path or
        None, TextGrid path, phoneme table rows, original-layer table rows).
        On any handled failure the status carries the error message and the
        other five entries are None.
    """
    if not audio_file or not annotation_file:
        return ("Please upload both an audio file and an annotation file.", *OUTPUTS_NONE)

    # An uploaded checkpoint wins; an unknown selection resolves to "not found"
    # instead of raising KeyError.
    if ckpt_upload:
        ckpt_to_use = ckpt_upload
    elif pretrained_choice in CKPT_FILES:
        ckpt_to_use = _resolve_ckpt(pretrained_choice)
    else:
        ckpt_to_use = None
    if not ckpt_to_use or not os.path.exists(ckpt_to_use):
        expected = CKPT_FILES.get(pretrained_choice, "a pretrained checkpoint")
        return (
            f"No checkpoint found. Expected {expected} "
            f"in {PRETRAINED_DIR}, or HF_MODEL_REPO set, or upload a .pt file.",
            *OUTPUTS_NONE,
        )

    progress(0.1, desc="Preparing workspace...")
    workspace = tempfile.mkdtemp(prefix="falcon_")

    def _fail(message):
        # Nothing inside the workspace is handed to the UI on failure, so it
        # can be removed right away. (On success it must stay: the returned
        # TextGrid / image paths live in it.)
        shutil.rmtree(workspace, ignore_errors=True)
        return (message, *OUTPUTS_NONE)

    base = "input"
    wav_path = os.path.join(workspace, f"{base}.wav")
    original_ext = os.path.basename(annotation_file).split(".")[-1].lower()
    ann_ext = original_ext
    ann_path = os.path.join(workspace, f"{base}.{ann_ext}")

    # Resample audio to 16 kHz mono
    try:
        audio, sr = torchaudio.load(audio_file)
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        torchaudio.save(wav_path, audio, 16000)
    except Exception as exc:
        return _fail(f"Audio error: {exc}")

    # Capture the original input tokens (whatever the user supplied per line):
    #   .phn → phoneme labels
    #   .wrd → word labels
    #   .txt → space-separated tokens (words or phonemes)
    try:
        shutil.copy(annotation_file, ann_path)
        orig_tokens = _read_annotation_tokens(ann_path, original_ext)
    except (OSError, UnicodeError) as exc:
        return _fail(f"Annotation error: {exc}")
    if original_ext == "txt" and not orig_tokens:
        return _fail("Text annotation is empty after stripping punctuation.")

    # Route by ORIGINAL extension (ann_ext becomes "phn" after the rewrite below).
    language = _internal_language(lang, mode, original_ext)

    # Word/text inputs (.wrd / .txt / .word) always go through the word -> LH39
    # G2P, regardless of the phoneme/word toggle — otherwise phoneme mode would
    # fall back to a crude character mapping and misalign. (.phn is the
    # phoneme-input path.)
    app_mapped_ph = None
    word_g2p_note = ""
    if original_ext in _WORD_INPUT_EXTS and orig_tokens:
        app_mapped_ph, g2p_note = _g2p_words(orig_tokens, g2p_choice, lang_choice)
        word_g2p_note = f" Word G2P: {g2p_note}."
        phons = [ph for seq in app_mapped_ph for ph in seq] or ["sil"]
        # Rewrite the annotation as a dummy uniform-time .phn of LH39 phonemes so
        # the stock English/phoneme aligner runs on them (no further G2P). This
        # is the only .phn written for .txt input: every non-empty .txt reaches
        # this branch, so no separate token-level dummy file is needed.
        ann_ext = "phn"
        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
        _write_uniform_phn(ann_path, phons, audio.shape[1])
        language = "english"  # phonemes are already LH39; skip the internal G2P

    progress(0.3, desc="Running alignment...")
    try:
        with _inference_lock:
            utils.set_dp_matrix_out_dir(workspace)
            try:
                pred_bound, _truth_bound, mapped_ph = main_predict(
                    wav=wav_path,
                    ckpt=ckpt_to_use,
                    w_phi=0.5,  # placeholder; the model uses its own learned w_phi at inference
                    language=language,
                    annotation=ann_ext,
                )
            finally:
                # Reset while the lock is still held, on success and on failure,
                # so a failing request can never clear the directory another
                # request has just set.
                utils.set_dp_matrix_out_dir(None)
    except Exception as exc:
        return _fail(f"Inference error: {exc}")

    progress(0.6, desc="Rendering aligned visualization...")
    # Time-aligned representations as one stacked figure (waveform,
    # spectrogram, phoneme posteriors, Soft-DP matrix + path, contrastive
    # score) — all on the same time axis with predicted boundaries overlaid.
    # The figure is optional: any failure just leaves the image output empty.
    panels_path = os.path.join(workspace, "panels.png")
    try:
        import falcon_viz
        falcon_viz.make_alignment_panels(
            wav=wav_path, ckpt=ckpt_to_use, out_path=panels_path,
            language=language, annotation=ann_ext,
            show_truth=(original_ext == "phn" and mode == "phoneme"),
        )
    except Exception as viz_exc:
        print(f"[FALCON] panel viz failed: {viz_exc}")
        panels_path = None

    # When the app did the word-level G2P itself, use its per-word LH39 phoneme
    # lists for the two-table / TextGrid word tier (main_predict's English path
    # returns mapped_ph=None).
    if app_mapped_ph is not None:
        mapped_ph = app_mapped_ph

    progress(0.8, desc="Building outputs...")
    pred_bound_list = [float(t) for t in pred_bound]
    table_phonemes, phn_intervals, table_original, orig_intervals = (
        _build_alignment_tables(pred_bound_list, mapped_ph, orig_tokens)
    )

    # ── TextGrid: phones tier always, original tier when applicable ──────────
    # Tier name reflects what the original layer represents.
    if mode == "word":
        orig_tier_name = "words"
    elif original_ext == "phn":
        orig_tier_name = "phones_original"
    else:
        orig_tier_name = "tokens"
    tg_path = os.path.join(workspace, f"{base}.TextGrid")
    _write_textgrid(tg_path, phn_intervals, orig_intervals, orig_tier_name)

    # Status note: the .phn-as-word case produces a second "words" table that
    # actually contains phonemes — surface this in the status so it's not
    # mistaken for a bug.
    status_note = word_g2p_note
    if mode == "word" and original_ext == "phn":
        status_note += (" Note: input was phoneme-level (.phn) but mode=word "
                        "— the 'original' table shows input phonemes since no "
                        "word annotations were provided.")

    return (
        "Done." + status_note,
        audio_file,
        panels_path if panels_path and os.path.exists(panels_path) else None,
        tg_path,
        table_phonemes,
        table_original,
    )
# ── UI ────────────────────────────────────────────────────────────────────────
# (label, value) pairs for the pretrained-checkpoint radio, in display order;
# the value is the checkpoint key handed to the alignment handler.
PRETRAINED_RADIO_CHOICES = [
    (CKPT_LABELS[ckpt_key], ckpt_key)
    for ckpt_key in ("english", "buckeye", "multilingual")
]
# Benchmark results, shown under the example. Numbers are exactly those reported
# in the paper (arXiv:2606.25460). "specialist" = trained on the target English
# corpus; "joint" = one model jointly trained on TIMIT+Buckeye; multilingual rows
# are zero-shot (no target-language training data).
# The text is rendered as Markdown; bold marks the best value in a column.
# NOTE(review): in the phoneme-level Dutch — IFA rows the ≤10 column bolds 26.85
# (joint) although the specialist row lists 26.86 — confirm against the paper.
RESULTS_MD = """### Results (from the [paper](https://arxiv.org/abs/2606.25460))
Accuracy = % of reference boundaries matched within the ms tolerance. **Specialist** =
trained on the target English corpus; **joint** = one model jointly trained on
TIMIT+Buckeye; multilingual rows are **zero-shot** (no target-language training data).
**Phone-level Alignment Accuracy [%]: MFA vs. FALCON (Ours)**
| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| TIMIT | MFA | **38.6** | 72.3 | 81.1 | 84.6 |
| TIMIT | FALCON specialist | 37.66 | **83.88** | **94.85** | **98.62** |
| TIMIT | FALCON joint | 34.70 | 82.62 | 94.91 | 98.60 |
| Buckeye | MFA | **35.3** | 60.6 | 68.9 | 72.7 |
| Buckeye | FALCON specialist | 29.69 | **69.93** | **90.07** | **97.40** |
| Buckeye | FALCON joint | 28.87 | 69.40 | 89.53 | 97.13 |
**Phoneme-Level: Unseen Multilingual Generalization Accuracy**
| Test set | Model | ≤10 | ≤15 | ≤20 | ≤25 | ≤50 | ≤100 |
|---|---|---|---|---|---|---|---|
| Dutch — IFA | **FALCON joint** | **26.85** | **36.16** | **44.56** | **51.17** | **69.94** | **84.11** |
| Dutch — IFA | FALCON specialist | 26.86 | 35.79 | 43.85 | 50.34 | 68.68 | 83.22 |
| Dutch — IFA | MFA | 11.01 | 14.70 | 19.05 | 21.80 | 33.90 | 51.02 |
| German — PHONDAT | **FALCON joint** | **25.63** | **34.12** | **41.87** | **49.07** | **70.04** | **84.58** |
| German — PHONDAT | FALCON specialist | 25.08 | 33.37 | 40.76 | 47.43 | 68.27 | 82.44 |
| German — PHONDAT | MFA | 20.60 | 31.75 | 37.17 | 45.83 | 66.78 | 79.19 |
| Hebrew | **FALCON joint** | **21.98** | **30.10** | **36.91** | **42.78** | **63.07** | **80.41** |
| Hebrew | FALCON specialist | 21.03 | 27.78 | 34.30 | 39.79 | 59.38 | 77.76 |
**Word-Level Alignment Accuracy [%]: Comparative Analysis**
| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| TIMIT | FALCON spec (MFA-G2P) | 49.22 | **81.79** | **93.04** | 98.37 |
| TIMIT | FALCON joint (MFA-G2P) | **49.50** | 80.60 | 92.86 | **98.46** |
| TIMIT | MFA | 41.60 | 72.80 | 89.40 | 97.40 |
| TIMIT | MMS | 18.60 | 43.50 | 75.70 | 94.70 |
| TIMIT | WhisperX | 22.40 | 52.70 | 82.40 | 94.20 |
| TIMIT | Nvidia-Canary-1b | 9.23 | 23.11 | 44.23 | 72.81 |
| Buckeye | FALCON spec (MFA-G2P) | 50.06 | 77.85 | **91.51** | **96.63** |
| Buckeye | FALCON joint (MFA-G2P) | **50.42** | **77.98** | 91.01 | 96.55 |
| Buckeye | MFA | 39.80 | 69.90 | 84.90 | 91.80 |
| Buckeye | MMS | 25.00 | 52.70 | 75.00 | 87.90 |
| Buckeye | WhisperX | 18.80 | 43.10 | 67.40 | 77.40 |
| Buckeye | Nvidia-Canary-1b | 8.06 | 18.83 | 36.31 | 63.29 |
**Word-Level: Unseen Multilingual Generalization Accuracy**
| Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 |
|---|---|---|---|---|---|
| German — PHONDAT | FALCON (MFA-G2P) | **44.20** | **68.48** | **86.12** | **95.11** |
| German — PHONDAT | MFA | 29.9 | 65.4 | 82.1 | 94.3 |
| German — PHONDAT | MMS | 21.8 | 44.3 | 74.9 | 91.8 |
| Dutch — IFA | FALCON (MFA-G2P) | **26.38** | **45.15** | 61.16 | 76.49 |
| Dutch — IFA | MFA | 4.7 | 7.3 | 11.6 | 19.0 |
| Dutch — IFA | MMS | 16.0 | 37.9 | **62.9** | **76.6** |
| Hebrew | FALCON | **31.91** | **56.72** | 75.18 | 87.89 |
| Hebrew | MMS | 14.3 | 41.3 | **76.5** | **94.7** |
"""
# Page layout: an inputs column and an outputs column side by side, followed by
# a clickable example and the benchmark tables; event wiring comes last.
# NOTE(review): the nesting below was reconstructed from the statement order —
# confirm the column / tab / `if` membership against the deployed layout.
with gr.Blocks(title="FALCON Forced Aligner", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# FALCON: Forced Alignment through Contrastive Optimization Networks")
    gr.Markdown(
        "Upload a speech file and a transcript to predict precise phoneme or word boundaries "
        "using Soft Dynamic Programming."
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Inputs")
            audio_in = gr.Audio(label="Audio file (any sample rate)", type="filepath")
            ann_in = gr.File(label="Annotation (.phn / .wrd / .txt)")
            mode_in = gr.Radio(["phoneme", "word"], value="phoneme", label="Mode")
            lang_in = gr.Radio(["english", "multilingual"], value="english", label="Language")
            # Word-level G2P front-end (word mode only). The espeak voice / MFA
            # dictionary is chosen automatically from the optional input-language
            # hint below; "Auto" also picks the best available backend.
            # NOTE(review): run_alignment applies the G2P to every .wrd / .txt /
            # .word input without checking the mode, so "word mode only" (here
            # and in the label below) does not match the handler — confirm
            # which of the two is intended.
            g2p_in = gr.Radio(
                ["Auto (recommended)", "espeak", "MFA-like",
                 "none — romanization (not recommended; only for languages with no G2P model)"],
                value="Auto (recommended)",
                label="Word G2P (used only in word mode)",
            )
            lang_g2p_in = gr.Dropdown(
                G2P_LANG_CHOICES,
                value="English (default)",
                label="Input language (optional, recommended — improves G2P choice)",
            )
            pretrained_in = gr.Radio(
                choices=PRETRAINED_RADIO_CHOICES,
                value="english",
                label="Pretrained checkpoint (auto-follows Language; override here if you want)",
            )
            ckpt_in = gr.File(label="Or upload a custom checkpoint (.pt) — overrides pretrained",
                              file_types=[".pt"], type="filepath")
            btn = gr.Button("Run Alignment", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            gr.Markdown("### Outputs")
            with gr.Tabs():
                with gr.Tab("Alignment Data"):
                    audio_out = gr.Audio(label="Playback", interactive=False)
                    table_phn_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Phoneme (LH39)"],
                        label="LH39 phonemes — aligner output",
                    )
                    table_orig_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Label"],
                        label="Original input layer (words / non-LH39 phonemes) — empty for English phoneme alignment",
                    )
                    tg_out = gr.File(label="Download TextGrid (carries both tiers when applicable)")
                with gr.Tab("Visualizations"):
                    img_panels = gr.Image(
                        label="Time-aligned representations — waveform · spectrogram · phoneme posteriors · Soft-DP path · contrastive score (shared time axis; predicted boundaries overlaid). Click to enlarge.",
                        show_download_button=True,
                    )
    gr.Markdown("### Example — click to load it, then press **Run Alignment**")
    # Each example row supplies one value per component listed in `inputs`,
    # in the same order.
    gr.Examples(
        examples=[
            ["assets/fasw0sa2.wav", "assets/fasw0sa2.phn", "phoneme",
             "english", "english", "Auto (recommended)", "English (default)"],
        ],
        inputs=[audio_in, ann_in, mode_in, lang_in, pretrained_in, g2p_in, lang_g2p_in],
        label="English · TIMIT",
    )
    gr.Markdown(RESULTS_MD)
    # Auto-flip the pretrained-checkpoint radio when the user changes language.
    # The language values ("english" / "multilingual") are also checkpoint keys,
    # so the selected value is passed through unchanged.
    lang_in.change(fn=lambda v: v, inputs=lang_in, outputs=pretrained_in)
    # `inputs` follows run_alignment's positional parameters; `outputs` follows
    # the order of the tuple it returns.
    btn.click(
        fn=run_alignment,
        inputs=[audio_in, ann_in, ckpt_in, mode_in, lang_in, pretrained_in,
                g2p_in, lang_g2p_in],
        outputs=[status, audio_out, img_panels,
                 tg_out, table_phn_out, table_orig_out],
    )
    # Auto-shutdown the local Python server when the user closes the browser
    # tab. Disabled on HuggingFace Spaces (where SPACE_ID is set automatically)
    # because the container is shared across visitors — one tab close should
    # not tear down everyone else's session.
    if not os.environ.get("SPACE_ID"):
        # Fast path: unload events click a hidden Shutdown button -> os._exit.
        shutdown_btn = gr.Button("Shutdown", visible=False, elem_id="falcon-shutdown-btn")
        shutdown_btn.click(fn=lambda: os._exit(0), inputs=[], outputs=[])
        # Guaranteed fallback: the page heartbeats; the watchdog exits if it stops.
        hb_btn = gr.Button("hb", visible=False, elem_id="falcon-heartbeat-btn")
        hb_btn.click(fn=_heartbeat, inputs=[], outputs=[],
                     show_progress="hidden", queue=False)
        _start_shutdown_watchdog()
        # Page-load script: clicks the shutdown button on beforeunload / pagehide
        # and clicks the heartbeat button once immediately, then every 10 s.
        demo.load(None, None, None, js="""
() => {
const stop = () => {
const btn = document.getElementById('falcon-shutdown-btn');
if (btn) btn.click();
};
window.addEventListener('beforeunload', stop);
window.addEventListener('pagehide', stop);
const beat = () => {
const hb = document.getElementById('falcon-heartbeat-btn');
if (hb) hb.click();
};
beat();
setInterval(beat, 10000);
}
""")
# Launch the server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()