Spaces:
Running on Zero
Running on Zero
| """fluentWhisper: side-by-side Whisper demo. | |
| Vanilla whisper-large-v3-turbo against the same model plus a disfluency-cleaning | |
| LoRA adapter, with a strikethrough diff view that makes the deletions visible. | |
| Ships the v3c winner adapter. The diff view (discourse markers, repetitions, and | |
| self-repairs struck through) is the visually dominant element, not "watch it | |
| delete um/uh" (vanilla already does that natively). | |
| """ | |
| import difflib | |
| import html | |
| import os | |
| import re | |
| import gradio as gr | |
# `spaces` only exists in the ZeroGPU runtime. Provide a no-op shim so the
# module imports cleanly anywhere (local lint, CI, py_compile, unit tests).
try:
    import spaces  # type: ignore
except Exception:  # pragma: no cover - exercised only off-Space

    class _SpacesShim:
        """Stand-in for the ``spaces`` package whose ``GPU`` decorator does nothing."""

        @staticmethod
        def GPU(*args, **kwargs):
            """Return the decorated function unchanged.

            Supports both the bare form ``@spaces.GPU`` and the configured
            form ``@spaces.GPU(duration=...)``.
            """
            # This has to be a staticmethod: `spaces` below is an *instance*,
            # so as a plain method the instance would arrive as args[0], the
            # bare `@spaces.GPU` check would fail, and the decorated function
            # would be replaced by `_decorator` itself.
            if len(args) == 1 and callable(args[0]) and not kwargs:
                return args[0]

            def _decorator(fn):
                return fn

            return _decorator

    spaces = _SpacesShim()  # type: ignore
# Checkpoint id handed to `from_pretrained` for both the processor and the model.
BASE = "openai/whisper-large-v3-turbo"

# The winning disfluency LoRA (v3c checkpoint-2000), published public on the Hub.
ADAPTER = "pradachan/whisper-large-v3-turbo-disfluency-lora"

# Sample rate (Hz) that audio is resampled to before feature extraction.
TARGET_SR = 16_000

# Whisper's native window in seconds; longer audio is split into chunks.
CHUNK_S = 30

# Beam width used for generation; matches the v1 capture methodology
# (src/baseline.py).
NUM_BEAMS = 1
| # --------------------------------------------------------------------------- | |
| # Model loading — lazy and CUDA-free at import time. | |
| # | |
| # ZeroGPU only attaches a GPU *inside* the @spaces.GPU-decorated `transcribe`, | |
| # so all model loading happens there on the first call. Loading the model (or | |
| # running any CUDA op) at module-import/startup raises "No CUDA GPUs are | |
| # available" on ZeroGPU, which is why there is no global preload here. | |
| # --------------------------------------------------------------------------- | |
| import torch | |
| from peft import PeftModel | |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
# Module-level cache populated by the first load_models() call.
_model, _processor, _device, _dtype = None, None, None, None


def load_models():
    """Return the cached processor and LoRA-wrapped model, loading them on first use.

    The base weights are loaded once and the adapter is attached on top of
    them; callers obtain vanilla output by decoding inside the model's
    ``disable_adapter()`` context, so only a single copy of the base model is
    held in memory.

    Returns:
        A dict with the keys ``"model"``, ``"processor"``, ``"device"`` and
        ``"dtype"``.
    """
    global _model, _processor, _device, _dtype

    if _model is None:
        # float16 on CUDA, float32 on CPU.
        has_cuda = torch.cuda.is_available()
        _dtype = torch.float16 if has_cuda else torch.float32
        _device = "cuda" if has_cuda else "cpu"

        _processor = WhisperProcessor.from_pretrained(BASE)
        base_model = WhisperForConditionalGeneration.from_pretrained(
            BASE, torch_dtype=_dtype
        )
        _model = PeftModel.from_pretrained(base_model, ADAPTER).to(_device)
        _model.eval()

    return dict(model=_model, processor=_processor, device=_device, dtype=_dtype)
| # --------------------------------------------------------------------------- | |
| # Audio loading / chunking | |
| # --------------------------------------------------------------------------- | |
def _load_audio_16k_mono(audio_path):
    """Read ``audio_path`` with librosa as mono audio resampled to ``TARGET_SR``.

    Returns the sample array produced by ``librosa.load``; the sample rate it
    also reports is discarded because it was requested explicitly.
    """
    import librosa  # function-scope import, kept off the module import path

    samples, _rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    return samples
def _chunk(audio, sr=TARGET_SR, chunk_s=CHUNK_S):
    """Cut ``audio`` into consecutive windows of at most ``chunk_s`` seconds.

    Audio that already fits in one window is returned as a single-element
    list. Otherwise fixed-size windows are taken back to back, and the last
    one may be shorter than the rest.
    """
    window = int(chunk_s * sr)
    total = len(audio)
    if total <= window:
        return [audio]

    starts = range(0, total, window)
    return [audio[start : start + window] for start in starts]
def _decode(model, processor, audio, device, dtype):
    """Transcribe ``audio`` with ``model`` and return the text as one string.

    The audio is split with ``_chunk``; each piece is decoded separately and
    the non-empty results are joined with single spaces.
    """
    import torch

    # English transcription prompt, computed once and reused for every chunk.
    prompt_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")

    pieces = []
    for segment in _chunk(audio):
        encoded = processor(segment, sampling_rate=TARGET_SR, return_tensors="pt")
        features = encoded.input_features.to(device=device, dtype=dtype)

        with torch.no_grad():
            token_ids = model.generate(
                features, forced_decoder_ids=prompt_ids, num_beams=NUM_BEAMS
            )

        decoded = processor.batch_decode(
            token_ids, skip_special_tokens=True, normalize=False
        )
        piece = decoded[0].strip()
        if piece:  # skip chunks that decoded to nothing
            pieces.append(piece)

    return " ".join(pieces).strip()
| # --------------------------------------------------------------------------- | |
| # Diff view — the centerpiece of the demo | |
| # --------------------------------------------------------------------------- | |
| _TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE) | |
| def _tokenize(text): | |
| """Split into word and punctuation tokens for token-level diffing.""" | |
| return _TOKEN_RE.findall(text or "") | |
_WORD_RE = re.compile(r"\w+(?:'\w+)*", re.UNICODE)  # words, keeping contractions whole


def diff_html(vanilla_text, tuned_text):
    """Render the vanilla transcript as HTML with the dropped words struck through.

    The two transcripts are compared as sequences of lowercased words, so
    differences in punctuation or casing never register as edits. The output
    keeps the vanilla text exactly as written (casing, punctuation, spacing),
    HTML-escaped, with every word wrapped in a ``fw-token`` span:

    * words the tuned transcript dropped outright get ``fw-removed``;
    * all other words get ``fw-kept``. That includes substituted words (an
      ASR variant such as "mum" vs "mom"), which are deliberately not marked
      because they are not disfluency removals.

    ``None`` is treated as an empty string for either argument.
    """
    source = vanilla_text or ""
    spans = list(_WORD_RE.finditer(source))
    spoken = [span.group(0).lower() for span in spans]
    cleaned = [hit.group(0).lower() for hit in _WORD_RE.finditer(tuned_text or "")]

    matcher = difflib.SequenceMatcher(a=spoken, b=cleaned, autojunk=False)
    # Indices of vanilla words covered by a pure "delete" opcode.
    struck = {
        idx
        for tag, lo, hi, _j1, _j2 in matcher.get_opcodes()
        if tag == "delete"
        for idx in range(lo, hi)
    }

    pieces = []
    pos = 0
    for idx, span in enumerate(spans):
        # Whatever sits between words (spaces, punctuation) passes through as is.
        pieces.append(html.escape(source[pos:span.start()]))
        word = html.escape(span.group(0))
        if idx in struck:
            markup = f'<span class="fw-token fw-removed" data-tooltip="Removed by the cleaning LoRA">{word}</span>'
        else:
            markup = f'<span class="fw-token fw-kept" data-tooltip="Kept">{word}</span>'
        pieces.append(markup)
        pos = span.end()
    pieces.append(html.escape(source[pos:]))

    body = "".join(pieces)
    return f'<div class="fw-diff-text">{body}</div>'
| # --------------------------------------------------------------------------- | |
| # Result metric — a normalized read-out shown beneath the diff | |
| # --------------------------------------------------------------------------- | |
| # Humanized per /humanizer: plain language, no em dashes, no promotional padding. | |
# Tooltip text for the n-WER badge rendered by metric_html().
_WER_INFO = (
    "n-WER is normalized word error rate. Before scoring we lowercase the text "
    "and drop punctuation, so capitalization and commas never count against the "
    "model and only the words get compared. This is the usual way ASR systems "
    "are measured, Whisper's own benchmarks included. On the DisfluencySpeech "
    "test set, vanilla Whisper scores 9.4% and this adapter scores 3.4%."
)

# Shown in the result row before anything is transcribed, so the area reads
# clean instead of jumping when the first result lands.
METRIC_PLACEHOLDER = (
    '<div class="fw-metric fw-metric-empty">'
    '<span class="fw-metric-placeholder">Your cleaning summary will show up here.</span>'
    "</div>"
)


def metric_html(vanilla_text, tuned_text):
    """Build the one-line read-out shown under the diff.

    The line has two parts: a live summary of how many words were removed
    from this clip, and a fixed benchmark badge whose tooltip defines n-WER.

    The live figure counts vanilla words that fall inside a pure "delete"
    opcode when the two transcripts are compared as lowercased word
    sequences, so punctuation and casing never count as a difference. The
    percentage is relative to the number of vanilla words. When at least one
    word was removed but the share rounds down to zero, it is reported as
    "<1%" rather than a contradictory "0%".

    Args:
        vanilla_text: Transcript from the base model; ``None`` counts as empty.
        tuned_text: Transcript from the adapter; ``None`` counts as empty.

    Returns:
        An HTML string: a ``fw-metric`` div holding the summary and the badge.
    """
    a_words = [m.group(0).lower() for m in _WORD_RE.finditer(vanilla_text or "")]
    b_words = [m.group(0).lower() for m in _WORD_RE.finditer(tuned_text or "")]
    sm = difflib.SequenceMatcher(a=a_words, b=b_words, autojunk=False)
    removed = sum(
        i2 - i1 for tag, i1, i2, _j1, _j2 in sm.get_opcodes() if tag == "delete"
    )

    if removed == 0:
        live = "No disfluencies detected in this clip."
    else:
        total = max(len(a_words), 1)  # never zero here, but keep the division safe
        pct = round(100 * removed / total)
        # round() can yield 0 for a small non-zero share (it also rounds an
        # exact 0.5 down to 0); the text is injected as HTML, hence "&lt;".
        share = f"{pct}%" if pct >= 1 else "&lt;1%"
        word = "word" if removed == 1 else "words"
        live = f"Cleaned {removed} disfluent {word} ({share} of what was spoken)."

    badge = (
        '<span class="fw-info" tabindex="0" '
        f'data-tooltip="{html.escape(_WER_INFO)}">'
        "Benchmarked using n-WER ⓘ</span>"
    )
    return f'<div class="fw-metric"><span class="fw-metric-live">{live}</span>{badge}</div>'
| # --------------------------------------------------------------------------- | |
| # Inference entry point (GPU) | |
| # --------------------------------------------------------------------------- | |
def transcribe(audio_path):
    """Transcribe one clip twice and return ``(diff_html, metric_html)`` markup.

    With no audio supplied, a prompt message and the metric placeholder are
    returned instead and no model is loaded.
    """
    # NOTE(review): the module comments describe this function as
    # @spaces.GPU-decorated, but no decorator is applied at this definition.
    # Confirm the GPU wrapping happens where the UI is wired up.
    if not audio_path:
        prompt = '<span class="fw-placeholder-text">Record or upload some speech, then hit Transcribe.</span>'
        return prompt, METRIC_PLACEHOLDER

    loaded = load_models()
    model = loaded["model"]
    processor = loaded["processor"]
    device, dtype = loaded["device"], loaded["dtype"]
    audio = _load_audio_16k_mono(audio_path)

    # Vanilla pass: same weights with the LoRA switched off, so a second copy
    # of the base model is never held in memory.
    with model.disable_adapter():
        vanilla_text = _decode(model, processor, audio, device, dtype)

    # Tuned pass: adapter active again once the context has exited.
    tuned_text = _decode(model, processor, audio, device, dtype)

    diff_markup = diff_html(vanilla_text, tuned_text)
    metric_markup = metric_html(vanilla_text, tuned_text)
    return diff_markup, metric_markup
| # --------------------------------------------------------------------------- | |
| # Facts carousel — rotating highlights shown in the hero section | |
| # --------------------------------------------------------------------------- | |
| FACTS = [ | |
| "On the DisfluencySpeech test set, vanilla Whisper scores " | |
| "<strong>9.4% WER</strong>. This adapter brings it down to <strong>3.4%</strong>.", | |
| "That is a <strong>6-point drop</strong> in word error rate, with a 95% " | |
| "bootstrap interval of [+5.0, +7.0]. The gain holds up.", | |
| "As far as we know, the <strong>only open Apache-2.0 model</strong> that removes " | |
| "fillers, discourse markers, repetitions, and self-repairs in one shot.", | |
| "It cleans your speech in a <strong>single pass</strong>. No second model, " | |
| "no LLM rewrite, no cloud round trip.", | |
| "Then it shows you <strong>exactly what it removed</strong>, struck through " | |
| "inline so you can trust the edit.", | |
| "Under the hood it is a small <strong>LoRA adapter</strong> on " | |
| "whisper-large-v3-turbo: rank 16, a few megabytes, loaded on top of the base.", | |
| "Trained on <strong>Modal</strong> from synthetic speech we built ourselves: " | |
| "LibriSpeech text, disfluencies injected, voiced with Kokoro across 54 voices.", | |
| "Runs <strong>offline on your own laptop</strong>. Apache-2.0, weights on " | |
| "Hugging Face, reproducible end to end.", | |
| ] | |
| FACT_DURATION = 5 # seconds each fact stays visible before fading | |
| def _fact_rotator_css(facts, duration=FACT_DURATION): | |
| """Generate the CSS keyframe animation for the rotating facts. | |
| Purely CSS-driven (no JavaScript). Each fact gets a staggered | |
| ``animation-delay`` so they cycle one after the other in an infinite loop. | |
| """ | |
| n = len(facts) | |
| total = n * duration | |
| show_pct = 100.0 / n | |
| fade_pct = show_pct * 0.15 | |
| delays = "\n".join( | |
| f".fw-fact:nth-child({i + 1}) {{ animation-delay: {i * duration}s; }}" | |
| for i in range(n) | |
| ) | |
| return f""" | |
| /* ---- fact rotator animation (auto-generated for {n} facts) ------------ */ | |
| .fw-fact {{ | |
| animation: fw-fact-fade {total}s ease-in-out infinite; | |
| }} | |
| @keyframes fw-fact-fade {{ | |
| 0% {{ opacity: 0; }} | |
| {fade_pct:.1f}% {{ opacity: 1; }} | |
| {show_pct - fade_pct:.1f}% {{ opacity: 1; }} | |
| {show_pct:.1f}% {{ opacity: 0; }} | |
| 100% {{ opacity: 0; }} | |
| }} | |
| {delays} | |
| """ | |
| def _build_header_html(facts): | |
| """Build the hero HTML with a rotating fact carousel at the bottom.""" | |
| fact_items = "\n".join( | |
| f' <div class="fw-fact">{f}</div>' for f in facts | |
| ) | |
| return f""" | |
| <div class="fw-hero"> | |
| <div class="fw-hero-section"> | |
| <div class="fw-eyebrow">✻ Apache-2.0 · Runs offline · Trained on Modal</div> | |
| </div> | |
| <div class="fw-hero-section"> | |
| <h1><span class="fw-accent">fluent</span>Whisper</h1> | |
| <p class="fw-lede">Speak messy. Read clean.</p> | |
| </div> | |
| <div class="fw-hero-section"> | |
| <div class="fw-fact-rotator"> | |
| {fact_items} | |
| </div> | |
| </div> | |
| </div> | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------------------------------- | |
| CUSTOM_CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital,wght@0,400;1,400&family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,500;0,6..72,600;1,6..72,400&display=swap'); | |
| /* ---- palette ---------------------------------------------------------- */ | |
| :root, .gradio-container { | |
| --fw-parchment: #f9f6f0; /* vintage paper background */ | |
| --fw-card: #fcfff9; /* clean white paper cards */ | |
| --fw-royal-gold: #9e7a44; /* antique brass/gold */ | |
| --fw-royal-gold-hover: #805e2f; | |
| --fw-ink-dark: #1e130c; /* dark coffee-bean black */ | |
| --fw-ink-muted: #6b5c51; /* vintage gray-brown */ | |
| --fw-border: #dfd7ca; /* subtle line separator */ | |
| --fw-crimson: #a63f3c; /* editorial dark red for deleted text */ | |
| --fw-crimson-bg: #fdf3f2; /* soft pink highlighted background */ | |
| } | |
| /* ---- page shell — fill the viewport, no floating, no dark bars -------- */ | |
| html, body, gradio-app, .gradio-container > .main, | |
| .gradio-container .wrap, .app, #root { | |
| background: var(--fw-parchment) !important; | |
| color-scheme: light; | |
| } | |
| .gradio-container { | |
| max-width: 1000px !important; | |
| width: calc(100% - 4rem) !important; | |
| margin: 3rem auto !important; | |
| background: var(--fw-card) !important; | |
| color: var(--fw-ink-dark) !important; | |
| font-family: 'Instrument Serif', Georgia, 'Times New Roman', serif !important; | |
| padding: 4rem 3rem !important; | |
| border: 6px double var(--fw-royal-gold) !important; | |
| border-radius: 0 !important; | |
| box-shadow: 0 10px 40px rgba(158,122,68,0.05) !important; | |
| overflow: visible !important; | |
| } | |
| .gradio-container .prose, .gradio-container p, .gradio-container span, | |
| .gradio-container label { color: var(--fw-ink-dark); } | |
| /* ---- hero — centered sections vertically stacked --------------------- */ | |
| .gradio-container .block.fw-hero-container, | |
| .gradio-container .block.fw-hero-container > .prose, | |
| .gradio-container .block.fw-hero-container > .wrap, | |
| .gradio-container .block.fw-hero-container > div, | |
| .gradio-container .block.fw-footer-container, | |
| .gradio-container .block.fw-footer-container > .prose, | |
| .gradio-container .block.fw-footer-container > .wrap, | |
| .gradio-container .block.fw-footer-container > div { | |
| background: transparent !important; | |
| border: 20px !important; | |
| box-shadow: none !important; | |
| padding: 0.2rem !important; | |
| display: flex !important; | |
| flex-direction: column !important; | |
| } | |
| .fw-hero { | |
| background: transparent !important; | |
| border: none !important; | |
| padding: 0 !important; | |
| margin: 0 0 2rem 0 !important; | |
| text-align: center !important; | |
| display: flex !important; | |
| flex-direction: column !important; | |
| justify-content: center !important; | |
| height: auto !important; | |
| min-height: auto !important; | |
| gap: 1.5rem !important; | |
| flex-grow: 0 !important; | |
| } | |
| .fw-hero-section { | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| } | |
| .fw-eyebrow { | |
| font-family: 'Instrument Serif', serif; | |
| font-style: normal; | |
| text-transform: none; | |
| font-size: 1.25rem; | |
| font-weight: 600; | |
| color: var(--fw-royal-gold); | |
| margin: 0; | |
| text-align: center !important; | |
| } | |
| .fw-hero h1 { | |
| font-family: 'Instrument Serif', Georgia, serif !important; | |
| font-weight: 400 !important; | |
| font-size: 4.2rem !important; | |
| line-height: 1.04 !important; | |
| color: var(--fw-ink-dark) !important; | |
| margin: 0 0 0.5rem !important; | |
| font-style: normal !important; | |
| text-align: center !important; | |
| } | |
| .fw-hero h1 .fw-accent { | |
| font-style: italic !important; | |
| color: var(--fw-royal-gold) !important; | |
| } | |
| .fw-lede { | |
| font-family: 'Instrument Serif', Georgia, serif !important; | |
| font-size: 2rem !important; | |
| color: var(--fw-ink-muted) !important; | |
| margin: 0 !important; | |
| text-align: center !important; | |
| } | |
| .fw-tagline { | |
| font-family: 'Newsreader', Georgia, serif !important; | |
| font-size: 1.15rem !important; | |
| line-height: 1.6 !important; | |
| color: var(--fw-ink-dark) !important; | |
| max-width: 32rem !important; | |
| margin: 0 auto !important; | |
| text-align: center !important; | |
| } | |
| .fw-tagline strong { color: var(--fw-royal-gold); font-weight: 600; } | |
| /* ---- WER stat — the visual anchor of the hero ------------------------- */ | |
| .fw-wer { | |
| display: flex; | |
| align-items: baseline; | |
| justify-content: center !important; | |
| gap: 0.8rem; | |
| margin: 0 0 0.5rem; | |
| } | |
| .fw-wer-num { | |
| font-family: 'Instrument Serif', Georgia, serif; | |
| font-size: 3.5rem; | |
| line-height: 1; | |
| color: var(--fw-ink-muted); | |
| } | |
| .fw-wer-good { color: var(--fw-royal-gold); font-weight: 600; } | |
| .fw-wer-arrow { font-size: 2.2rem; color: var(--fw-ink-muted); } | |
| .fw-wer-unit { | |
| font-family: 'Instrument Serif', Georgia, serif; | |
| font-size: 1.8rem; | |
| color: var(--fw-ink-dark); | |
| letter-spacing: 0.04em; | |
| } | |
| .fw-wer-note { | |
| font-family: 'Newsreader', Georgia, serif !important; | |
| font-size: 0.95rem !important; | |
| line-height: 1.6 !important; | |
| color: var(--fw-ink-muted) !important; | |
| margin: 0 auto !important; | |
| max-width: 28rem !important; | |
| text-align: center !important; | |
| box-sizing: border-box; | |
| } | |
| /* ---- fact rotator ----------------------------------------------------- */ | |
| .fw-fact-rotator { | |
| display: grid; | |
| place-items: center; | |
| min-height: 2.5rem; | |
| width: 100%; | |
| } | |
| .fw-fact { | |
| grid-area: 1 / 1; | |
| text-align: center; | |
| opacity: 0; | |
| font-family: 'Newsreader', Georgia, serif; | |
| font-size: 1.15rem; | |
| line-height: 1.5; | |
| color: var(--fw-ink-muted); | |
| max-width: 36rem; | |
| width: 100%; | |
| padding: 0 1rem; | |
| box-sizing: border-box; | |
| } | |
| .fw-fact strong { | |
| color: var(--fw-royal-gold); | |
| font-weight: 600; | |
| } | |
| /* ---- blocks / cards --------------------------------------------------- */ | |
| .gradio-container .block, | |
| .gradio-container .form, | |
| .gradio-container .gr-box { | |
| background: var(--fw-card) !important; | |
| border: 1px solid var(--fw-border) !important; | |
| border-radius: 8px !important; | |
| box-shadow: 0 2px 8px rgba(158,122,68,0.03) !important; | |
| } | |
| /* Style block headers and labels */ | |
| .gradio-container .block-label, | |
| .gradio-container .label-wrap, | |
| .gradio-container span[data-testid="block-info"], | |
| .gradio-container .block .label-wrap { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0.4rem 0.8rem 0 !important; | |
| } | |
| /* SMALLER, cleaner component labels to remove bloat */ | |
| .gradio-container .block-label span, | |
| .gradio-container .label-wrap span, | |
| .gradio-container label > span, | |
| .gradio-container .block-info, | |
| .gradio-container span[data-testid="block-info"] { | |
| font-family: 'Instrument Serif', serif !important; | |
| font-style: normal !important; | |
| text-transform: none !important; | |
| letter-spacing: 0.02em !important; | |
| font-size: 1.15rem !important; | |
| font-weight: 500 !important; | |
| color: var(--fw-royal-gold) !important; | |
| background: transparent !important; | |
| } | |
| .gradio-container .block-label svg, | |
| .gradio-container .label-wrap svg { color: var(--fw-royal-gold) !important; } | |
| .gradio-container textarea, | |
| .gradio-container input { | |
| font-family: 'Newsreader', Georgia, serif !important; | |
| font-size: 1.05rem !important; | |
| color: var(--fw-ink-dark) !important; | |
| background: var(--fw-card) !important; | |
| border: 1px solid var(--fw-border) !important; | |
| border-radius: 4px !important; | |
| } | |
| /* ---- tabs and source navigation -------------------------------------- */ | |
| .gradio-container .tab-nav, | |
| .gradio-container div[role="tablist"] { | |
| background: transparent !important; | |
| border-bottom: 1px solid var(--fw-border) !important; | |
| gap: 0.5rem !important; | |
| padding: 0.2rem 0 !important; | |
| } | |
| .gradio-container .tab-nav button, | |
| .gradio-container button[role="tab"] { | |
| font-family: 'Instrument Serif', Georgia, serif !important; | |
| font-size: 1.15rem !important; | |
| font-style: normal !important; | |
| color: var(--fw-ink-muted) !important; | |
| background: transparent !important; | |
| border: 1px solid transparent !important; | |
| border-radius: 4px 4px 0 0 !important; | |
| padding: 0.4rem 1rem !important; | |
| transition: all 0.15s ease !important; | |
| } | |
| .gradio-container .tab-nav button.selected, | |
| .gradio-container button[role="tab"][aria-selected="true"] { | |
| color: var(--fw-ink-dark) !important; | |
| font-weight: 600 !important; | |
| background: var(--fw-card) !important; | |
| border: 1px solid var(--fw-border) !important; | |
| border-bottom-color: var(--fw-card) !important; | |
| box-shadow: none !important; | |
| } | |
| /* ---- pressable gold button ------------------------------------------- */ | |
| .fw-btn, .fw-btn button { | |
| background: var(--fw-royal-gold) !important; | |
| color: #fffdfa !important; | |
| font-family: 'Instrument Serif', Georgia, serif !important; | |
| font-weight: 600 !important; | |
| font-size: 1.15rem !important; | |
| letter-spacing: 0.05em !important; | |
| text-transform: uppercase !important; | |
| border: 1px solid var(--fw-royal-gold-hover) !important; | |
| border-radius: 4px !important; | |
| padding: 0.8rem 2.2rem !important; | |
| box-shadow: 0 2px 4px rgba(26,15,8,0.1) !important; | |
| transition: all 0.2s ease !important; | |
| } | |
| .fw-btn:hover, .fw-btn button:hover { | |
| background: var(--fw-royal-gold-hover) !important; | |
| box-shadow: 0 4px 8px rgba(26,15,8,0.15) !important; | |
| transform: translateY(-1px); | |
| } | |
| .fw-btn:active, .fw-btn button:active { | |
| background: var(--fw-royal-gold-hover) !important; | |
| box-shadow: inset 0 2px 4px rgba(0,0,0,0.2) !important; | |
| transform: translateY(0); | |
| } | |
| /* ---- force button text color inside components (microphone/upload, etc) -- */ | |
| .gradio-container button { | |
| color: var(--fw-ink-dark) !important; | |
| font-family: 'Instrument Serif', Georgia, serif !important; | |
| font-style: normal !important; | |
| font-weight: 500 !important; | |
| font-size: 1.15rem !important; | |
| } | |
| /* ---- scale audio component fonts so they are not diminished ---- */ | |
| .gradio-container .fw-audio, | |
| .gradio-container .fw-audio * { | |
| font-size: 1.05rem !important; | |
| } | |
| .gradio-container .fw-audio .block-label span { | |
| font-size: 1.15rem !important; | |
| } | |
| .gradio-container .fw-audio button { | |
| font-size: 1.15rem !important; | |
| } | |
| /* Keep the waveform / playback chrome from spilling out of its box */ | |
| .gradio-container .fw-audio .waveform-container, | |
| .gradio-container .fw-audio .controls, | |
| .gradio-container .fw-audio .component-wrapper { | |
| max-width: 100% !important; | |
| overflow: hidden !important; | |
| box-sizing: border-box !important; | |
| } | |
| .gradio-container .fw-audio .controls * { line-height: normal !important; } | |
| .fw-placeholder-text { | |
| color: var(--fw-ink-dark) !important; | |
| font-style: normal !important; | |
| font-size: 1.15rem !important; | |
| font-family: 'Newsreader', Georgia, serif !important; | |
| } | |
| /* ---- diff card & text ------------------------------------------------- */ | |
| .gradio-container .fw-diff { | |
| background: var(--fw-card) !important; | |
| border: 1px solid var(--fw-border) !important; | |
| border-radius: 8px !important; | |
| padding: 1.5rem !important; | |
| min-height: 80px !important; | |
| overflow: visible !important; | |
| } | |
| .fw-diff-text, .fw-placeholder-text { | |
| display: block; | |
| border-left: 4px solid var(--fw-royal-gold); | |
| padding-left: 1.5rem; | |
| margin-left: 0.2rem; | |
| border-radius: 4px; | |
| } | |
| .fw-diff-text { | |
| font-size: 1.25rem !important; | |
| line-height: 2 !important; | |
| color: var(--fw-ink-dark) !important; | |
| font-family: 'Newsreader', Georgia, serif !important; | |
| } | |
| .fw-diff-text span:not(.fw-removed) { | |
| color: var(--fw-ink-dark) !important; | |
| } | |
| /* Tooltip container */ | |
| .fw-token { | |
| position: relative; | |
| display: inline-block; | |
| cursor: pointer; | |
| padding: 0 1px; | |
| } | |
| /* Tooltip text bubble */ | |
| .fw-token::after { | |
| content: attr(data-tooltip); | |
| position: absolute; | |
| bottom: 130%; | |
| left: 50%; | |
| transform: translateX(-50%) scale(0.95); | |
| background-color: var(--fw-royal-gold) !important; | |
| color: var(--fw-parchment) !important; | |
| padding: 6px 10px; | |
| border-radius: 6px; | |
| font-size: 0.85rem; | |
| font-family: 'Newsreader', Georgia, serif; | |
| font-weight: 500; | |
| text-decoration: none !important; | |
| font-style: normal !important; | |
| white-space: nowrap; | |
| z-index: 1000; | |
| box-shadow: 0 4px 15px rgba(158, 122, 68, 0.15); | |
| opacity: 0; | |
| pointer-events: none; | |
| transition: opacity 0.15s ease, transform 0.15s cubic-bezier(0.16, 1, 0.3, 1); | |
| } | |
| /* Tooltip arrow */ | |
| .fw-token::before { | |
| content: ""; | |
| position: absolute; | |
| bottom: 115%; | |
| left: 50%; | |
| transform: translateX(-50%) scale(0.95); | |
| border-width: 6px; | |
| border-style: solid; | |
| border-color: var(--fw-royal-gold) transparent transparent transparent !important; | |
| z-index: 1000; | |
| opacity: 0; | |
| pointer-events: none; | |
| transition: opacity 0.15s ease, transform 0.15s cubic-bezier(0.16, 1, 0.3, 1); | |
| } | |
| .fw-token:hover::after, | |
| .fw-token:hover::before { | |
| opacity: 1; | |
| transform: translateX(-50%) scale(1); | |
| } | |
| .fw-kept:hover { | |
| background-color: rgba(158, 122, 68, 0.08); | |
| border-radius: 3px; | |
| } | |
| .fw-removed { | |
| text-decoration: line-through; | |
| color: var(--fw-crimson); | |
| background-color: var(--fw-crimson-bg); | |
| text-decoration-thickness: 1.5px; | |
| text-decoration-color: var(--fw-crimson); | |
| padding: 0 3px; | |
| border-radius: 3px; | |
| } | |
| .fw-removed:hover { | |
| background-color: #fbdcd9; | |
| } | |
| /* ---- result metric line ---------------------------------------------- */ | |
| .gradio-container .fw-metric-wrap { | |
| overflow: visible !important; | |
| } | |
| .fw-metric { | |
| height: 36px; | |
| display: flex; | |
| flex-wrap: wrap; | |
| align-items: baseline; | |
| justify-content: space-between; | |
| gap: 0.6rem 1.2rem; | |
| margin: 0.9rem 0.2rem 0.9rem; | |
| padding: 0 0.2rem; | |
| font-family: 'Newsreader', Georgia, serif; | |
| } | |
| .fw-metric-live { | |
| font-size: 1.05rem; | |
| color: var(--fw-ink-muted); | |
| } | |
| /* empty state — a faint grey placeholder so a fresh visit reads clean */ | |
| .fw-metric-empty { | |
| justify-content: flex-start; | |
| } | |
| .fw-metric-placeholder { | |
| font-size: 1.05rem; | |
| color: #b7ad9f; | |
| } | |
| .fw-info { | |
| position: relative; | |
| display: inline-block; | |
| cursor: help; | |
| font-size: 0.95rem; | |
| font-weight: 600; | |
| color: var(--fw-royal-gold); | |
| letter-spacing: 0.01em; | |
| border-bottom: 1px dotted var(--fw-royal-gold); | |
| outline: none; | |
| } | |
| /* wrapping tooltip bubble (the .fw-token tooltip is single-line; this one wraps) */ | |
| .fw-info::after { | |
| content: attr(data-tooltip); | |
| position: absolute; | |
| bottom: 150%; | |
| right: 0; | |
| width: 280px; | |
| max-width: 78vw; | |
| white-space: normal; | |
| text-align: left; | |
| line-height: 1.45; | |
| background-color: var(--fw-ink-dark); | |
| color: var(--fw-parchment); | |
| padding: 10px 12px; | |
| border-radius: 8px; | |
| font-size: 0.85rem; | |
| font-weight: 400; | |
| font-family: 'Newsreader', Georgia, serif; | |
| z-index: 1000; | |
| box-shadow: 0 6px 20px rgba(30, 19, 12, 0.25); | |
| opacity: 0; | |
| pointer-events: none; | |
| transform: translateY(4px); | |
| transition: opacity 0.15s ease, transform 0.15s ease; | |
| } | |
| .fw-info:hover::after, | |
| .fw-info:focus::after { | |
| opacity: 1; | |
| transform: translateY(0); | |
| } | |
| /* ---- footer ----------------------------------------------------------- */ | |
| .fw-footer { | |
| text-align: center; | |
| font-family: 'Instrument Serif', serif; | |
| font-style: normal; | |
| font-weight: bold; | |
| font-size: 1.15rem; | |
| color: var(--fw-ink-muted); | |
| padding: 1.5rem 0 0.5rem; | |
| margin-top: 1.5rem; | |
| border-top: 1px solid var(--fw-border); | |
| } | |
| .fw-footer .fw-heart { color: var(--fw-crimson); } | |
| .fw-footer a { | |
| color: var(--fw-royal-gold); | |
| font-weight: bold; | |
| text-decoration: none; | |
| border-bottom: 1px dotted var(--fw-royal-gold); | |
| } | |
| .fw-footer a:hover { color: var(--fw-royal-gold-hover); border-bottom-style: solid; } | |
| /* hide gradio's default footer */ | |
| footer { display: none !important; } | |
| /* ---- responsiveness --------------------------------------------------- */ | |
| @media (max-width: 900px) { | |
| .gradio-container { | |
| width: calc(100% - 1.5rem) !important; | |
| margin: 1rem auto !important; | |
| padding: 2.5rem 1.5rem !important; | |
| border-width: 4px !important; | |
| } | |
| .fw-layout-row { | |
| flex-direction: column !important; | |
| gap: 3.5rem !important; | |
| } | |
| .gradio-container .block.fw-hero-container, | |
| .gradio-container .block.fw-hero-container > .prose, | |
| .gradio-container .block.fw-hero-container > .wrap, | |
| .gradio-container .block.fw-hero-container > div { | |
| height: auto !important; | |
| } | |
| .fw-hero { | |
| min-height: auto !important; | |
| height: auto !important; | |
| gap: 2.5rem !important; | |
| } | |
| .fw-hero h1 { | |
| font-size: 3rem !important; | |
| } | |
| .fw-lede { | |
| font-size: 1.6rem !important; | |
| } | |
| .fw-wer-num { | |
| font-size: 2.8rem !important; | |
| } | |
| } | |
| """ | |
# Rendered once at import time from FACTS; build_demo() places the resulting
# markup at the top of the page.
HEADER_HTML = _build_header_html(FACTS)
| # --------------------------------------------------------------------------- | |
| # Curated example clips (real DisfluencySpeech test audio). Each entry is only | |
| # offered if its .wav is actually present, so a missing file never breaks the | |
| # build. Paths are resolved next to this file so they work locally and on Space. | |
| # --------------------------------------------------------------------------- | |
_EX_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "examples")

# (filename, human-readable label) for each curated clip, in display order.
_EXAMPLE_SPECS = [
    ("idx_049.wav", "Spontaneous interview answer"),
    ("idx_144.wav", "Casual spoken reply"),
    ("idx_222.wav", "Off-the-cuff explanation"),
    ("idx_083.wav", "Unscripted remark"),
]

# Resolve every clip to an absolute path, then probe the filesystem exactly
# once per clip. Both public lists are derived from this single filtered
# sequence, so EXAMPLES[i] and EXAMPLE_LABELS[i] always describe the same file.
# isfile() (rather than exists()) also rejects a directory that happens to
# share a clip's name.
_EXAMPLE_PATHS = [
    (os.path.join(_EX_DIR, fn), label) for fn, label in _EXAMPLE_SPECS
]
_AVAILABLE_EXAMPLES = [
    (path, label) for path, label in _EXAMPLE_PATHS if os.path.isfile(path)
]

# One single-element row per available clip (one value for the one audio input).
EXAMPLES = [[path] for path, _ in _AVAILABLE_EXAMPLES]
# Display labels, index-aligned with EXAMPLES.
EXAMPLE_LABELS = [label for _, label in _AVAILABLE_EXAMPLES]
# Attribution strip; build_demo() adds it as the last element of the page and
# the .fw-footer rules in the stylesheet style it.
FOOTER_HTML = """
<div class="fw-footer">
Made with <span class="fw-heart">❤</span> by
<a href="https://aipdv.com" target="_blank" rel="noopener">Prabhudayal Vaishnav</a>
</div>
"""
def _theme():
    """Build the warm editorial "paper" theme used for all Gradio chrome.

    Starts from ``gr.themes.Soft`` with the stone hue and the Instrument Serif
    font stack, then overrides the background, label, input, button and link
    colours with the page palette.

    Every override is applied to both the light variable and its ``_dark``
    twin, so the page looks the same whichever colour scheme the browser
    requests. This includes the primary-button hover colour, whose dark twin
    would otherwise fall back to the base theme's own value.

    Returns:
        The customised theme object, ready to pass to ``gr.Blocks(theme=...)``.
    """
    theme = gr.themes.Soft(
        primary_hue=gr.themes.colors.stone,
        secondary_hue=gr.themes.colors.stone,
        neutral_hue=gr.themes.colors.stone,
        font=[gr.themes.GoogleFont("Instrument Serif"), "Georgia", "serif"],
    )

    # Single source of truth for the palette: theme variable -> value.
    palette = {
        "body_background_fill": "#f9f6f0",
        "block_background_fill": "#fcfbf9",
        "block_label_text_color": "#9e7a44",
        "block_title_text_color": "#9e7a44",
        "block_label_background_fill": "transparent",
        "block_label_border_width": "0px",
        "input_background_fill": "#fcfbf9",
        "button_primary_background_fill": "#9e7a44",
        "button_primary_background_fill_hover": "#805e2f",
        "button_primary_text_color": "#fcfbf9",
        "button_secondary_background_fill": "#fcfbf9",
        "button_secondary_text_color": "#1e130c",
        "button_secondary_border_color": "#dfd7ca",
        "link_text_color": "#9e7a44",
    }

    # Mirror each value onto its dark-mode twin so the two can never diverge.
    overrides = {}
    for variable, value in palette.items():
        overrides[variable] = value
        overrides[f"{variable}_dark"] = value

    theme.set(**overrides)
    return theme
def build_demo():
    """Assemble the Gradio Blocks page and wire up transcription.

    Layout, top to bottom: hero header, audio input (microphone or upload),
    optional curated example clips, the Transcribe button, the diff view, the
    metric panel, and the footer. ``transcribe`` runs both when the button is
    clicked and when a microphone recording stops; either way it receives the
    audio file path and fills the diff and metric panels.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    stylesheet = CUSTOM_CSS + _fact_rotator_css(FACTS)

    with gr.Blocks(title="fluentWhisper", theme=_theme(), css=stylesheet) as demo:
        gr.HTML(HEADER_HTML, elem_classes=["fw-hero-container"])

        # The mid-recording "page unresponsive" freeze was caused by the cold
        # model load blocking the first request; the lazy load inside the
        # @spaces.GPU transcribe function keeps that work off the page thread.
        waveform = gr.WaveformOptions(show_recording_waveform=True)
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Speak or upload audio",
            elem_classes=["fw-audio"],
            waveform_options=waveform,
        )

        # Curated DisfluencySpeech test clips on which the adapter cleanly
        # removes real discourse markers and repetitions. They are the
        # examples behind the 3.4% WER figure, so they show the model at its
        # best. The widget is skipped entirely when no clip is on disk.
        if EXAMPLES:
            gr.Examples(
                examples=EXAMPLES,
                inputs=audio_in,
                label="Or try a real clip from the DisfluencySpeech test set",
                example_labels=EXAMPLE_LABELS,
            )

        run_btn = gr.Button("Transcribe", variant="primary", elem_classes=["fw-btn"])

        # The diff is the headline result, so it sits directly under the
        # button rather than below a stack of empty boxes.
        diff_out = gr.HTML(
            label="What the LoRA removed",
            elem_classes=["fw-diff"],
            value='<span class="fw-placeholder-text">Record or upload some speech, then hit Transcribe.</span>',
        )
        metric_out = gr.HTML(value=METRIC_PLACEHOLDER, elem_classes=["fw-metric-wrap"])

        # Same handler and same input/outputs for both triggers: an explicit
        # button click, and the end of a microphone recording.
        for register in (run_btn.click, audio_in.stop_recording):
            register(
                transcribe,
                inputs=audio_in,
                outputs=[diff_out, metric_out],
            )

        gr.HTML(FOOTER_HTML, elem_classes=["fw-footer-container"])

    return demo
# Built at import time so the Blocks app is available as the module-level
# name `demo`; no model is loaded here (see build_demo / transcribe).
demo = build_demo()


if __name__ == "__main__":
    # Direct execution (python app.py): start the Gradio server.
    demo.launch()