| """ |
| FALCON web demo — interactive forced alignment in the browser. |
| |
| Three pretrained checkpoints are used by FALCON (under pretrained_models/): |
| - falcon_timit_english.pt — TIMIT-trained, best for English phoneme alignment |
| - falcon_buckeye_english.pt — Buckeye-trained, recommended for spontaneous English |
| - falcon_joint_multilingual.pt — joint TIMIT+Buckeye model; best for cross-lingual / |
| multilingual zero-shot alignment (Dutch, German, |
| Hebrew, ...) at both phoneme and word level. |
| |
| The app picks one automatically from the `Language` radio (english → TIMIT, |
| multilingual → joint); the Buckeye model is chosen manually from the checkpoint |
| radio, and a custom .pt upload overrides all of them. |
| |
| For HuggingFace Spaces deployment, set Space Secrets: |
| HF_MODEL_REPO — e.g. "MLSpeech/FALCON-weights" |
| HF_TOKEN — only needed for private repos |
| The app will download the selected checkpoint file (e.g. `falcon_timit_english.pt` or |
| `falcon_joint_multilingual.pt`) from that repo when it is not found locally. |
| """ |
| import os |
| import re |
| import shutil |
| import sys |
| import tempfile |
| import threading |
| import time |
|
|
| |
| |
| |
| |
def _patch_panphon():
    """Repair one malformed line in the installed ``panphon`` package, in place.

    The targeted line carries parameter annotations inside a call expression
    (``self.word_fts(word: str, normalize: bool=True)``), which is not valid
    Python; it is rewritten to a plain call. The function is a no-op when
    ``panphon`` is not installed, has no package directory, or does not contain
    the malformed line. I/O and decoding failures are reported on stdout and
    otherwise ignored, so the patch stays best-effort.
    """
    import importlib.util

    spec = importlib.util.find_spec("panphon")
    locations = getattr(spec, "submodule_search_locations", None) if spec else None
    if not locations:
        return
    target = os.path.join(list(locations)[0], "featuretable.py")
    bad = "word_features = self.word_fts(word: str, normalize: bool=True)"
    good = "word_features = self.word_fts(word, normalize=True)"
    try:
        # Python source files are UTF-8 by default; read and write with that
        # codec explicitly so the rewrite does not depend on the host locale.
        with open(target, encoding="utf-8") as fh:
            source = fh.read()
        if bad in source:
            with open(target, "w", encoding="utf-8") as fh:
                fh.write(source.replace(bad, good))
    except (OSError, UnicodeError) as exc:
        print(f"[FALCON] panphon patch skipped: {exc}")


# Must run before the imports below so the package is repaired before anything
# tries to import it.
_patch_panphon()
|
|
| import gradio as gr |
| import textgrid |
| import torchaudio |
|
|
| import utils |
| from predict import main_predict |
|
|
| |
| |
| |
# Directory holding this script; bundled assets are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PRETRAINED_DIR = os.path.join(SCRIPT_DIR, "pretrained_models")

# When SPACE_ID is set (presumably by the Hugging Face Spaces runtime — confirm),
# provide defaults for the MFA-related variables without overriding values the
# environment already defines.
if os.environ.get("SPACE_ID"):
    _SPACE_DIR = SCRIPT_DIR
    os.environ.setdefault("MFA_ROOT_DIR", os.path.join(_SPACE_DIR, "mfa_assets"))
    os.environ.setdefault("FDNFA_MFA_ENV_PY", sys.executable)

# Checkpoint key -> file name expected under PRETRAINED_DIR (or in the HF repo).
CKPT_FILES = dict(
    english="falcon_timit_english.pt",
    buckeye="falcon_buckeye_english.pt",
    multilingual="falcon_joint_multilingual.pt",
)
# Checkpoint key -> human-readable label shown in the UI.
CKPT_LABELS = dict(
    english="Read English (recommended) — TIMIT model",
    buckeye="Spontaneous English (recommended) — Buckeye model",
    multilingual="Multilingual — joint TIMIT+Buckeye model",
)

# Checkpoint key -> path resolved earlier in this process.
_ckpt_cache = dict()
|
|
def _resolve_ckpt(key: str):
    """Resolve a pretrained-checkpoint key to a file path.

    Lookup order: in-process cache → ``PRETRAINED_DIR`` → Hugging Face Hub (the
    Hub is tried only when the ``HF_MODEL_REPO`` environment variable is set;
    ``HF_TOKEN`` is forwarded as the access token when present).

    Args:
        key: A key of ``CKPT_FILES`` (e.g. ``"english"``).

    Returns:
        The checkpoint path, or ``None`` when ``key`` is not a known checkpoint
        or every lookup fails.
    """
    filename = CKPT_FILES.get(key)
    if filename is None:
        # Unknown or missing selection: honour the "None on failure" contract
        # instead of surfacing a KeyError to the caller.
        return None

    cached = _ckpt_cache.get(key)
    if cached and os.path.exists(cached):
        return cached

    local_path = os.path.join(PRETRAINED_DIR, filename)
    if os.path.exists(local_path):
        _ckpt_cache[key] = local_path
        return local_path

    repo = os.environ.get("HF_MODEL_REPO", "")
    if not repo:
        return None

    try:
        # Imported lazily so the dependency is only needed when a Hub repo is configured.
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id=repo,
            filename=filename,
            token=os.environ.get("HF_TOKEN"),
        )
    except Exception as exc:
        # Best-effort download: any failure is logged and reported as "not found".
        print(f"[FALCON] HF Hub fetch failed for {filename}: {exc}")
        return None

    _ckpt_cache[key] = path
    return path
|
|
# Serializes the inference section of run_alignment: the lock is held while the
# DP-matrix output directory is set on `utils` and main_predict runs.
_inference_lock = threading.Lock()
|
|
| |
| |
| |
| |
| |
# time.monotonic() reading of the most recent browser heartbeat; None until the
# first heartbeat arrives. Boxed in a list so the handler and the watchdog
# thread share it without a `global` statement.
_last_ping = [None]
# Seconds without a heartbeat after which the watchdog terminates the process.
_HEARTBEAT_TIMEOUT = 90


def _heartbeat():
    """Record that the browser page is still alive."""
    # Monotonic clock: elapsed-time checks must not be affected by wall-clock
    # adjustments, which could otherwise trigger or suppress the shutdown.
    _last_ping[0] = time.monotonic()


def _start_shutdown_watchdog():
    """Start a daemon thread that exits the process once heartbeats stop.

    The thread polls every 5 seconds. It does nothing until the first heartbeat
    has been recorded; after that, it calls ``os._exit(0)`` as soon as the last
    heartbeat is older than ``_HEARTBEAT_TIMEOUT`` seconds.
    """
    def _watch():
        while True:
            time.sleep(5)
            last = _last_ping[0]
            if last is not None and (time.monotonic() - last) > _HEARTBEAT_TIMEOUT:
                os._exit(0)

    threading.Thread(target=_watch, daemon=True).start()
|
|
| |
|
|
def _internal_language(lang: str, mode: str, ann_ext: str) -> str:
    """
    Translate the UI selections into the `language` flag passed to main_predict.

    Returns 'english' (no G2P; labels are taken to be TIMIT-39 phonemes already)
    only for the combination English + phoneme mode + a .phn annotation.
    Every other combination returns 'dutch', which selects the G2P pipeline
    (panphon-based articulatory mapping): non-English input, word-level
    alignment (.wrd), and plain-text input.

    `ann_ext` must be the extension of the file the user actually supplied,
    not the extension in effect after a .txt has been rewritten to a dummy .phn.
    """
    if lang != "english":
        return "dutch"
    if mode != "phoneme":
        return "dutch"
    return "english" if ann_ext.lower() == "phn" else "dutch"
|
|
| |
|
|
| |
| |
| |
# UI language label -> (espeak voice, MFA voice). An MFA voice of None means no
# MFA model is listed for that language.
_G2P_LANG_MAP = {
    label: (espeak_voice, mfa_voice)
    for label, espeak_voice, mfa_voice in (
        ("English (default)", "en-us", "en-us"),
        ("German", "de", "de"),
        ("Dutch", "nl", "nl"),
        ("Hebrew", "he", None),
        ("French", "fr", None),
        ("Spanish", "es", None),
        ("Italian", "it", None),
        ("Russian", "ru", None),
        ("Portuguese", "pt", None),
        ("Other / unknown", "en-us", None),
    )
}
# Dropdown entries, in the same order as the table above.
G2P_LANG_CHOICES = list(_G2P_LANG_MAP)
|
|
def _resolve_g2p(g2p_choice, lang_choice):
    """Pick a concrete G2P backend for the given UI selections.

    `g2p_choice` is the "Word G2P" option and `lang_choice` the input-language
    label. The result is `(backend, voice, note)` with backend one of "mfa",
    "espeak" or "char". An explicit espeak / MFA-like / none selection is
    honoured when possible and otherwise falls back to the next available
    backend; any other selection ("Auto") prefers MFA, then espeak, then
    character-level romanization.
    """
    espeak_voice, mfa_voice = _G2P_LANG_MAP.get(lang_choice, ("en-us", "en-us"))

    # MFA counts as usable only if the language has an MFA voice and the helper
    # module reports it available; any failure here means "not usable".
    try:
        import mfa_g2p
        mfa_ok = mfa_voice is not None and mfa_g2p.mfa_available(mfa_voice)
    except Exception:
        mfa_ok = False

    wanted = (g2p_choice or "Auto").lower()
    if wanted.startswith("none"):
        return "char", espeak_voice or "en-us", "none (romanization)"

    # Each entry is (usable?, result); the first usable entry wins.
    if wanted.startswith("mfa"):
        ranked = (
            (mfa_ok, ("mfa", mfa_voice, "MFA-like")),
            (espeak_voice, ("espeak", espeak_voice, "espeak (no MFA model for this language)")),
        )
        last_resort = ("char", "en-us", "none (no MFA/espeak model)")
    elif wanted.startswith("espeak"):
        ranked = (
            (espeak_voice, ("espeak", espeak_voice, "espeak")),
            (mfa_ok, ("mfa", mfa_voice, "MFA-like (no espeak voice for this language)")),
        )
        last_resort = ("char", "en-us", "none")
    else:
        ranked = (
            (mfa_ok, ("mfa", mfa_voice, "MFA-like (auto)")),
            (espeak_voice, ("espeak", espeak_voice, "espeak (auto)")),
        )
        last_resort = ("char", "en-us", "none (auto)")

    for usable, outcome in ranked:
        if usable:
            return outcome
    return last_resort
|
|
| |
|
|
# Placeholder values for the five non-status outputs, used on every failure path.
OUTPUTS_NONE = (None, None, None, None, None)


def _write_uniform_phn(path, labels, num_samples):
    """Write a placeholder .phn file spreading `labels` evenly over `num_samples`.

    Each line is ``<start> <end> <label>`` with integer bounds. The timings are
    evenly spaced placeholders, not real boundaries.
    """
    step = num_samples / max(1, len(labels))
    # Explicit UTF-8 so non-ASCII labels never depend on the host locale.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{int(i * step)} {int((i + 1) * step)} {label}\n"
            for i, label in enumerate(labels)
        )


def run_alignment(audio_file, annotation_file, ckpt_upload, mode, lang,
                  pretrained_choice, g2p_choice="Auto (recommended)",
                  lang_choice="English (default)",
                  progress=gr.Progress(track_tqdm=True)):
    """Align one audio file against one annotation file and build the UI outputs.

    Args:
        audio_file: Path of the uploaded audio.
        annotation_file: Path of the uploaded annotation (.phn / .wrd / .txt).
        ckpt_upload: Optional path of a custom checkpoint; overrides
            `pretrained_choice` when given.
        mode: "phoneme" or "word".
        lang: "english" or "multilingual".
        pretrained_choice: Key of `CKPT_FILES` selecting a pretrained checkpoint.
        g2p_choice: "Word G2P" option; see `_resolve_g2p`.
        lang_choice: Input-language label; see `_resolve_g2p`.
        progress: Gradio progress reporter.

    Returns:
        A 6-tuple ``(status, audio_path, panels_png_or_None, textgrid_path,
        phoneme_rows, original_rows)``. On failure the status carries the error
        message and the other five items are ``None``.
    """
    if not audio_file or not annotation_file:
        return ("Please upload both an audio file and an annotation file.", *OUTPUTS_NONE)

    if ckpt_upload:
        ckpt_to_use = ckpt_upload
    elif pretrained_choice in CKPT_FILES:
        ckpt_to_use = _resolve_ckpt(pretrained_choice)
    else:
        ckpt_to_use = None
    if not ckpt_to_use or not os.path.exists(ckpt_to_use):
        # .get() so an unknown selection still yields a message, not a KeyError.
        expected = CKPT_FILES.get(pretrained_choice, "a pretrained checkpoint")
        return (
            f"No checkpoint found. Expected {expected} "
            f"in {PRETRAINED_DIR}, or HF_MODEL_REPO set, or upload a .pt file.",
            *OUTPUTS_NONE,
        )

    progress(0.1, desc="Preparing workspace...")
    workspace = tempfile.mkdtemp(prefix="falcon_")

    def _fail(message):
        # No workspace file is handed to the UI on failure, so remove the
        # directory instead of leaking it.
        shutil.rmtree(workspace, ignore_errors=True)
        return (message, *OUTPUTS_NONE)

    base = "input"
    wav_path = os.path.join(workspace, f"{base}.wav")

    # `original_ext` is what the user supplied; `ann_ext` may later be rewritten
    # to "phn" when a placeholder annotation is synthesized.
    original_ext = os.path.basename(annotation_file).split(".")[-1].lower()
    ann_ext = original_ext
    ann_path = os.path.join(workspace, f"{base}.{ann_ext}")

    # Normalize the audio to mono, 16 kHz.
    try:
        audio, sr = torchaudio.load(audio_file)
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        torchaudio.save(wav_path, audio, 16000)
    except Exception as exc:
        return _fail(f"Audio error: {exc}")
    num_samples = audio.shape[1]

    # Copy the annotation next to the audio and extract its labels.
    # "utf-8-sig" decodes UTF-8 with or without a BOM, independent of the locale.
    try:
        shutil.copy(annotation_file, ann_path)
        with open(ann_path, encoding="utf-8-sig") as f:
            if original_ext == "txt":
                orig_tokens = re.sub(r"[^\w\s]", "", f.read().strip()).split()
            else:
                # The label is the last whitespace-separated field of each line.
                orig_tokens = [ln.strip().split()[-1] for ln in f if ln.strip()]
    except (OSError, UnicodeError) as exc:
        return _fail(f"Annotation error: {exc}")

    if original_ext == "txt":
        # Two leading integers followed by text look like a "<start> <end>"
        # header (presumably TIMIT-style transcripts — confirm); drop them.
        if len(orig_tokens) >= 3 and orig_tokens[0].isdigit() and orig_tokens[1].isdigit():
            orig_tokens = orig_tokens[2:]
        if not orig_tokens:
            return _fail("Text annotation is empty after stripping punctuation.")
        ann_ext = "phn"
        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
        _write_uniform_phn(ann_path, orig_tokens, num_samples)
    elif not orig_tokens:
        return _fail("Annotation file contains no labels.")

    language = _internal_language(lang, mode, original_ext)

    # Word / text input: convert each token to phonemes here, then hand the
    # aligner a plain phoneme sequence.
    app_mapped_ph = None
    word_g2p_note = ""
    if original_ext in ("wrd", "txt", "word") and orig_tokens:
        import word_g2p
        backend, voice, g2p_note = _resolve_g2p(g2p_choice, lang_choice)
        try:
            app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice, backend=backend)
                             for tok in orig_tokens]
        except Exception as g2p_exc:
            # Degrade step by step: chosen backend → espeak → character mapping.
            print(f"[FALCON] G2P backend '{backend}' failed ({g2p_exc}); falling back.")
            try:
                app_mapped_ph = [word_g2p.word_to_lh39(tok, voice=voice or "en-us",
                                                       backend="espeak")
                                 for tok in orig_tokens]
                g2p_note += " → espeak fallback"
            except Exception:
                app_mapped_ph = [word_g2p.word_to_lh39(tok, backend="char")
                                 for tok in orig_tokens]
                g2p_note += " → none fallback"
        word_g2p_note = f" Word G2P: {g2p_note}."
        phons = [ph for seq in app_mapped_ph for ph in seq] or ["sil"]

        ann_ext = "phn"
        ann_path = os.path.join(workspace, f"{base}.{ann_ext}")
        _write_uniform_phn(ann_path, phons, num_samples)
        # The labels are phonemes now, so the aligner must not run G2P again.
        language = "english"

    progress(0.3, desc="Running alignment...")
    try:
        with _inference_lock:
            try:
                utils.set_dp_matrix_out_dir(workspace)
                pred_bound, _truth_bound, mapped_ph = main_predict(
                    wav=wav_path,
                    ckpt=ckpt_to_use,
                    w_phi=0.5,
                    language=language,
                    annotation=ann_ext,
                )
            finally:
                # Reset while the lock is still held: resetting after release
                # could clobber the directory another request has just set.
                utils.set_dp_matrix_out_dir(None)
        progress(0.6, desc="Rendering aligned visualization...")

        # Rendering runs after the lock is released and is best-effort: a
        # failure only drops the image.
        panels_path = os.path.join(workspace, "panels.png")
        try:
            import falcon_viz
            falcon_viz.make_alignment_panels(
                wav=wav_path, ckpt=ckpt_to_use, out_path=panels_path,
                language=language, annotation=ann_ext,
                show_truth=(original_ext == "phn" and mode == "phoneme"),
            )
        except Exception as viz_exc:
            print(f"[FALCON] panel viz failed: {viz_exc}")
            panels_path = None
    except Exception as exc:
        return _fail(f"Inference error: {exc}")

    # When G2P ran in the app, its per-token phoneme lists are authoritative.
    if app_mapped_ph is not None:
        mapped_ph = app_mapped_ph

    progress(0.8, desc="Building outputs...")

    pred_bound_list = [float(t) for t in pred_bound]

    if mapped_ph is not None:
        phn_labels = [ph for seq in mapped_ph for ph in seq]
    else:
        phn_labels = orig_tokens

    # Phoneme layer: boundary i closes the segment that starts at boundary i-1.
    table_phonemes, phn_intervals = [], []
    t0 = 0.0
    for i, t1 in enumerate(pred_bound_list):
        lbl = phn_labels[i] if i < len(phn_labels) else ""
        table_phonemes.append([round(t0, 3), round(t1, 3), lbl])
        if t1 > t0:
            phn_intervals.append((t0, t1, lbl))
        t0 = t1

    # Original-token layer: a token ends at the boundary of its last phoneme.
    table_original, orig_intervals = [], []
    if mapped_ph is not None:
        cumulative = 0
        t0 = 0.0
        for i, seq in enumerate(mapped_ph):
            cumulative += len(seq)
            if cumulative > len(pred_bound_list):
                break
            # With no phoneme consumed yet, index cumulative - 1 would be -1 and
            # wrap to the LAST boundary; such a token has zero width at t0.
            t1 = pred_bound_list[cumulative - 1] if cumulative else t0
            lbl = orig_tokens[i] if i < len(orig_tokens) else ""
            table_original.append([round(t0, 3), round(t1, 3), lbl])
            if t1 > t0:
                orig_intervals.append((t0, t1, lbl))
            t0 = t1

    # TextGrid with the phoneme tier and, when available, the original tier.
    max_time = phn_intervals[-1][1] if phn_intervals else 0.0
    tg = textgrid.TextGrid(minTime=0, maxTime=max_time)
    tier_phn = textgrid.IntervalTier(name="phones", minTime=0, maxTime=max_time)
    for t0_iv, t1_iv, lbl_iv in phn_intervals:
        tier_phn.add(minTime=t0_iv, maxTime=t1_iv, mark=lbl_iv)
    tg.append(tier_phn)
    if orig_intervals:
        if mode == "word":
            orig_tier_name = "words"
        elif original_ext == "phn":
            orig_tier_name = "phones_original"
        else:
            orig_tier_name = "tokens"
        tier_orig = textgrid.IntervalTier(name=orig_tier_name, minTime=0, maxTime=max_time)
        for t0_iv, t1_iv, lbl_iv in orig_intervals:
            tier_orig.add(minTime=t0_iv, maxTime=t1_iv, mark=lbl_iv)
        tg.append(tier_orig)
    tg_path = os.path.join(workspace, f"{base}.TextGrid")
    tg.write(tg_path)

    status_note = word_g2p_note
    if mode == "word" and original_ext == "phn":
        status_note += (" Note: input was phoneme-level (.phn) but mode=word "
                        "— the 'original' table shows input phonemes since no "
                        "word annotations were provided.")

    # The workspace is kept on success: the returned paths point into it.
    return (
        "Done." + status_note,
        audio_file,
        panels_path if panels_path and os.path.exists(panels_path) else None,
        tg_path,
        table_phonemes,
        table_original,
    )
|
|
| |
|
|
# (label, value) pairs for the checkpoint radio, in display order.
PRETRAINED_RADIO_CHOICES = [
    (CKPT_LABELS[ckpt_key], ckpt_key)
    for ckpt_key in ("english", "buckeye", "multilingual")
]
|
|
| |
| |
| |
| |
| RESULTS_MD = """### Results (from the [paper](https://arxiv.org/abs/2606.25460)) |
| |
| Accuracy = % of reference boundaries matched within the ms tolerance. **Specialist** = |
| trained on the target English corpus; **joint** = one model jointly trained on |
| TIMIT+Buckeye; multilingual rows are **zero-shot** (no target-language training data). |
| |
| **Phone-level Alignment Accuracy [%]: MFA vs. FALCON (Ours)** |
| |
| | Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 | |
| |---|---|---|---|---|---| |
| | TIMIT | MFA | **38.6** | 72.3 | 81.1 | 84.6 | |
| | TIMIT | FALCON specialist | 37.66 | **83.88** | **94.85** | **98.62** | |
| | TIMIT | FALCON joint | 34.70 | 82.62 | 94.91 | 98.60 | |
| | Buckeye | MFA | **35.3** | 60.6 | 68.9 | 72.7 | |
| | Buckeye | FALCON specialist | 29.69 | **69.93** | **90.07** | **97.40** | |
| | Buckeye | FALCON joint | 28.87 | 69.40 | 89.53 | 97.13 | |
| |
| **Phoneme-Level: Unseen Multilingual Generalization Accuracy** |
| |
| | Test set | Model | ≤10 | ≤15 | ≤20 | ≤25 | ≤50 | ≤100 | |
| |---|---|---|---|---|---|---|---| |
| | Dutch — IFA | **FALCON joint** | **26.85** | **36.16** | **44.56** | **51.17** | **69.94** | **84.11** | |
| | Dutch — IFA | FALCON specialist | 26.86 | 35.79 | 43.85 | 50.34 | 68.68 | 83.22 | |
| | Dutch — IFA | MFA | 11.01 | 14.70 | 19.05 | 21.80 | 33.90 | 51.02 | |
| | German — PHONDAT | **FALCON joint** | **25.63** | **34.12** | **41.87** | **49.07** | **70.04** | **84.58** | |
| | German — PHONDAT | FALCON specialist | 25.08 | 33.37 | 40.76 | 47.43 | 68.27 | 82.44 | |
| | German — PHONDAT | MFA | 20.60 | 31.75 | 37.17 | 45.83 | 66.78 | 79.19 | |
| | Hebrew | **FALCON joint** | **21.98** | **30.10** | **36.91** | **42.78** | **63.07** | **80.41** | |
| | Hebrew | FALCON specialist | 21.03 | 27.78 | 34.30 | 39.79 | 59.38 | 77.76 | |
| |
| **Word-Level Alignment Accuracy [%]: Comparative Analysis** |
| |
| | Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 | |
| |---|---|---|---|---|---| |
| | TIMIT | FALCON spec (MFA-G2P) | 49.22 | **81.79** | **93.04** | 98.37 | |
| | TIMIT | FALCON joint (MFA-G2P) | **49.50** | 80.60 | 92.86 | **98.46** | |
| | TIMIT | MFA | 41.60 | 72.80 | 89.40 | 97.40 | |
| | TIMIT | MMS | 18.60 | 43.50 | 75.70 | 94.70 | |
| | TIMIT | WhisperX | 22.40 | 52.70 | 82.40 | 94.20 | |
| | TIMIT | Nvidia-Canary-1b | 9.23 | 23.11 | 44.23 | 72.81 | |
| | Buckeye | FALCON spec (MFA-G2P) | 50.06 | 77.85 | **91.51** | **96.63** | |
| | Buckeye | FALCON joint (MFA-G2P) | **50.42** | **77.98** | 91.01 | 96.55 | |
| | Buckeye | MFA | 39.80 | 69.90 | 84.90 | 91.80 | |
| | Buckeye | MMS | 25.00 | 52.70 | 75.00 | 87.90 | |
| | Buckeye | WhisperX | 18.80 | 43.10 | 67.40 | 77.40 | |
| | Buckeye | Nvidia-Canary-1b | 8.06 | 18.83 | 36.31 | 63.29 | |
| |
| **Word-Level: Unseen Multilingual Generalization Accuracy** |
| |
| | Dataset | Model | t≤10 | t≤25 | t≤50 | t≤100 | |
| |---|---|---|---|---|---| |
| | German — PHONDAT | FALCON (MFA-G2P) | **44.20** | **68.48** | **86.12** | **95.11** | |
| | German — PHONDAT | MFA | 29.9 | 65.4 | 82.1 | 94.3 | |
| | German — PHONDAT | MMS | 21.8 | 44.3 | 74.9 | 91.8 | |
| | Dutch — IFA | FALCON (MFA-G2P) | **26.38** | **45.15** | 61.16 | 76.49 | |
| | Dutch — IFA | MFA | 4.7 | 7.3 | 11.6 | 19.0 | |
| | Dutch — IFA | MMS | 16.0 | 37.9 | **62.9** | **76.6** | |
| | Hebrew | FALCON | **31.91** | **56.72** | 75.18 | 87.89 | |
| | Hebrew | MMS | 14.3 | 41.3 | **76.5** | **94.7** | |
| """ |
|
|
# Build the Gradio UI; components are created in display order.
# NOTE(review): the nesting below was reconstructed from a flattened listing —
# confirm the Row / Column / Tab membership of each component against the app.
with gr.Blocks(title="FALCON Forced Aligner", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# FALCON: Forced Alignment through Contrastive Optimization Networks")
    gr.Markdown(
        "Upload a speech file and a transcript to predict precise phoneme or word boundaries "
        "using Soft Dynamic Programming."
    )

    with gr.Row():
        # Left column: every user input, the run button and the status line.
        with gr.Column(scale=1):
            gr.Markdown("### Inputs")
            audio_in = gr.Audio(label="Audio file (any sample rate)", type="filepath")
            ann_in = gr.File(label="Annotation (.phn / .wrd / .txt)")
            mode_in = gr.Radio(["phoneme", "word"], value="phoneme", label="Mode")
            lang_in = gr.Radio(["english", "multilingual"], value="english", label="Language")
            # _resolve_g2p matches these options by lowercase prefix
            # ("none" / "mfa" / "espeak"); anything else is treated as Auto.
            g2p_in = gr.Radio(
                ["Auto (recommended)", "espeak", "MFA-like",
                 "none — romanization (not recommended; only for languages with no G2P model)"],
                value="Auto (recommended)",
                label="Word G2P (used only in word mode)",
            )
            lang_g2p_in = gr.Dropdown(
                G2P_LANG_CHOICES,
                value="English (default)",
                label="Input language (optional, recommended — improves G2P choice)",
            )
            pretrained_in = gr.Radio(
                choices=PRETRAINED_RADIO_CHOICES,
                value="english",
                label="Pretrained checkpoint (auto-follows Language; override here if you want)",
            )
            ckpt_in = gr.File(label="Or upload a custom checkpoint (.pt) — overrides pretrained",
                              file_types=[".pt"], type="filepath")
            btn = gr.Button("Run Alignment", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)

        # Right column: results, split into a data tab and a visualization tab.
        with gr.Column(scale=2):
            gr.Markdown("### Outputs")
            with gr.Tabs():
                with gr.Tab("Alignment Data"):
                    audio_out = gr.Audio(label="Playback", interactive=False)
                    table_phn_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Phoneme (LH39)"],
                        label="LH39 phonemes — aligner output",
                    )
                    table_orig_out = gr.Dataframe(
                        headers=["Start (s)", "End (s)", "Label"],
                        label="Original input layer (words / non-LH39 phonemes) — empty for English phoneme alignment",
                    )
                    tg_out = gr.File(label="Download TextGrid (carries both tiers when applicable)")
                with gr.Tab("Visualizations"):
                    img_panels = gr.Image(
                        label="Time-aligned representations — waveform · spectrogram · phoneme posteriors · Soft-DP path · contrastive score (shared time axis; predicted boundaries overlaid). Click to enlarge.",
                        show_download_button=True,
                    )

    # One bundled example; each row supplies a value for every component in `inputs`, in order.
    gr.Markdown("### Example — click to load it, then press **Run Alignment**")
    gr.Examples(
        examples=[
            ["assets/fasw0sa2.wav", "assets/fasw0sa2.phn", "phoneme",
             "english", "english", "Auto (recommended)", "English (default)"],
        ],
        inputs=[audio_in, ann_in, mode_in, lang_in, pretrained_in, g2p_in, lang_g2p_in],
        label="English · TIMIT",
    )

    gr.Markdown(RESULTS_MD)

    # Keep the checkpoint radio in step with the Language radio: both Language
    # values ("english", "multilingual") are also valid checkpoint values.
    lang_in.change(fn=lambda v: v, inputs=lang_in, outputs=pretrained_in)

    # `inputs` follows run_alignment's positional parameters; `outputs` follows
    # the order of the 6-tuple it returns.
    btn.click(
        fn=run_alignment,
        inputs=[audio_in, ann_in, ckpt_in, mode_in, lang_in, pretrained_in,
                g2p_in, lang_g2p_in],
        outputs=[status, audio_out, img_panels,
                 tg_out, table_phn_out, table_orig_out],
    )

    # Lifecycle hooks that are only installed when SPACE_ID is not set.
    if not os.environ.get("SPACE_ID"):
        # Hidden button whose click handler terminates the server process.
        shutdown_btn = gr.Button("Shutdown", visible=False, elem_id="falcon-shutdown-btn")
        shutdown_btn.click(fn=lambda: os._exit(0), inputs=[], outputs=[])

        # Hidden button the page clicks periodically; each click records a
        # heartbeat for the watchdog thread started just below.
        hb_btn = gr.Button("hb", visible=False, elem_id="falcon-heartbeat-btn")
        hb_btn.click(fn=_heartbeat, inputs=[], outputs=[],
                     show_progress="hidden", queue=False)
        _start_shutdown_watchdog()

        # Page-load script: click the shutdown button on `beforeunload` /
        # `pagehide`, and click the heartbeat button once immediately and then
        # every 10000 ms.
        demo.load(None, None, None, js="""
() => {
const stop = () => {
const btn = document.getElementById('falcon-shutdown-btn');
if (btn) btn.click();
};
window.addEventListener('beforeunload', stop);
window.addEventListener('pagehide', stop);
const beat = () => {
const hb = document.getElementById('falcon-heartbeat-btn');
if (hb) hb.click();
};
beat();
setInterval(beat, 10000);
}
""")


# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|