# fluentwhisper/app.py, uploaded by pradachan via huggingface_hub (commit 0758931, verified).
"""fluentWhisper: side-by-side Whisper demo.
Vanilla whisper-large-v3-turbo against the same model plus a disfluency-cleaning
LoRA adapter, with a strikethrough diff view that makes the deletions visible.
Ships the v3c winner adapter. The diff view (discourse markers, repetitions, and
self-repairs struck through) is the visually dominant element, not "watch it
delete um/uh" (vanilla already does that natively).
"""
import difflib
import html
import os
import re
import gradio as gr
# `spaces` only exists in the ZeroGPU runtime. Provide a no-op shim so the
# module imports cleanly anywhere (local lint, CI, py_compile, unit tests).
try:
    import spaces  # type: ignore
except Exception:  # pragma: no cover - exercised only off-Space

    class _SpacesShim:
        """Stand-in for the ZeroGPU ``spaces`` module whose ``GPU`` decorator does nothing."""

        @staticmethod
        def GPU(*args, **kwargs):
            """Return the decorated function unchanged.

            Works for both spellings: bare ``@spaces.GPU`` and the
            parameterised ``@spaces.GPU(duration=...)``.
            """
            used_bare = not kwargs and len(args) == 1 and callable(args[0])
            if used_bare:
                # @spaces.GPU: the single positional argument is the function.
                return args[0]
            # @spaces.GPU(...): hand back a pass-through decorator.
            return lambda fn: fn

    spaces = _SpacesShim()  # type: ignore
# Hub id of the base Whisper checkpoint; the processor and the base weights are both loaded from it.
BASE = "openai/whisper-large-v3-turbo"
# The winning disfluency LoRA (v3c checkpoint-2000), published public on the Hub.
ADAPTER = "pradachan/whisper-large-v3-turbo-disfluency-lora"
# Sample rate in Hz: audio is resampled to it on load and the processor is told to expect it.
TARGET_SR = 16000
CHUNK_S = 30 # whisper's native window; longer audio is split into chunks.
NUM_BEAMS = 1 # matches the v1 capture methodology (src/baseline.py).
# ---------------------------------------------------------------------------
# Model loading — lazy and CUDA-free at import time.
#
# ZeroGPU only attaches a GPU *inside* the @spaces.GPU-decorated `transcribe`,
# so all model loading happens there on the first call. Loading the model (or
# running any CUDA op) at module-import/startup raises "No CUDA GPUs are
# available" on ZeroGPU, which is why there is no global preload here.
# ---------------------------------------------------------------------------
import torch
from peft import PeftModel
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# Process-wide cache filled by load_models() on its first call.
_model = _processor = _device = _dtype = None


def load_models():
    """Return the shared processor and LoRA-wrapped model, loading them on first use.

    The base checkpoint is loaded once and the adapter is attached on top of
    it. Callers obtain the vanilla behaviour by decoding inside the model's
    ``disable_adapter()`` context, so only one copy of the base weights is held.

    Returns:
        dict with the keys ``"model"``, ``"processor"``, ``"device"`` and
        ``"dtype"``.
    """
    global _model, _processor, _device, _dtype
    if _model is None:
        on_gpu = torch.cuda.is_available()
        # Half precision on CUDA, full precision on CPU.
        _dtype = torch.float16 if on_gpu else torch.float32
        _device = "cuda" if on_gpu else "cpu"
        _processor = WhisperProcessor.from_pretrained(BASE)
        backbone = WhisperForConditionalGeneration.from_pretrained(
            BASE, torch_dtype=_dtype
        )
        _model = PeftModel.from_pretrained(backbone, ADAPTER).to(_device)
        _model.eval()
    return dict(model=_model, processor=_processor, device=_device, dtype=_dtype)
# ---------------------------------------------------------------------------
# Audio loading / chunking
# ---------------------------------------------------------------------------
def _load_audio_16k_mono(audio_path):
    """Read ``audio_path`` with librosa, downmixed to mono and resampled to TARGET_SR."""
    # Imported lazily so the module can be imported without librosa installed.
    import librosa

    samples, _rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    return samples
def _chunk(audio, sr=TARGET_SR, chunk_s=CHUNK_S):
    """Cut ``audio`` into consecutive windows of at most ``chunk_s`` seconds.

    Audio that already fits in one window is returned as a single-item list.
    Longer audio is sliced into fixed, non-overlapping windows; the last one
    may be shorter than the rest.
    """
    window = int(chunk_s * sr)
    total = len(audio)
    if total <= window:
        return [audio]
    pieces = []
    for start in range(0, total, window):
        pieces.append(audio[start : start + window])
    return pieces
def _decode(model, processor, audio, device, dtype):
    """Transcribe ``audio`` with ``model`` and return the text as one string.

    The audio is split with ``_chunk``; each window is decoded separately
    (English, transcribe task, NUM_BEAMS beams) and the non-empty results are
    joined with single spaces.
    """
    import torch

    prompt_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
    pieces = []
    with torch.no_grad():
        for window in _chunk(audio):
            encoded = processor(
                window, sampling_rate=TARGET_SR, return_tensors="pt"
            )
            # Move the features to the model's device and precision.
            features = encoded.input_features.to(device=device, dtype=dtype)
            token_ids = model.generate(
                features, forced_decoder_ids=prompt_ids, num_beams=NUM_BEAMS
            )
            decoded = processor.batch_decode(
                token_ids, skip_special_tokens=True, normalize=False
            )
            piece = decoded[0].strip()
            if piece:
                pieces.append(piece)
    return " ".join(pieces).strip()
# ---------------------------------------------------------------------------
# Diff view — the centerpiece of the demo
# ---------------------------------------------------------------------------
# A token is either a run of word characters or one non-space punctuation mark.
_TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def _tokenize(text):
    """Return the word and punctuation tokens of ``text``; falsy input gives ``[]``."""
    if not text:
        return []
    return _TOKEN_RE.findall(text)
_WORD_RE = re.compile(r"\w+(?:'\w+)*", re.UNICODE)  # words, keeping contractions whole


def _removed_words(vanilla_text, tuned_text):
    """Find the vanilla words and which of them the cleaned model dropped.

    Both the diff view and the summary line call this, so they always agree
    on what counts as "removed".

    Comparison runs on lowercased word tokens only. A word counts as removed
    only when it sits in a SequenceMatcher "delete" opcode; words inside a
    "replace" opcode (substitutions) are not counted.

    Returns:
        (matches, removed): ``matches`` is the list of regex matches for every
        word in ``vanilla_text``, in order; ``removed`` is the set of indices
        into ``matches`` that were deleted.
    """
    matches = list(_WORD_RE.finditer(vanilla_text or ""))
    a_words = [m.group(0).lower() for m in matches]
    b_words = [m.group(0).lower() for m in _WORD_RE.finditer(tuned_text or "")]
    sm = difflib.SequenceMatcher(a=a_words, b=b_words, autojunk=False)
    removed = set()
    for tag, i1, i2, _j1, _j2 in sm.get_opcodes():
        if tag == "delete":  # only words dropped outright count as removed
            removed.update(range(i1, i2))
    return matches, removed


def diff_html(vanilla_text, tuned_text):
    """Render the vanilla transcript with the words the cleaned model REMOVED
    shown as red strikethrough, and everything else left intact.

    The cleaned model lowercases its output and strips most punctuation, so a
    raw text diff would flag every comma, capital letter, and full stop as an
    "edit" and bury the actual disfluency deletions in noise (this is exactly
    what made the diff look broken). To avoid that:

    * Comparison runs on lowercased *word* tokens only; punctuation and casing
      are ignored, so they never count as edits.
    * The display preserves the original vanilla text verbatim (its casing,
      punctuation, and spacing). Only whole words are ever struck through.
    * Only true deletions are struck. Word substitutions (an ASR variant like
      "mum" vs "mom", or an acronym respelled "CNN" vs "c n n") are left alone,
      because those are not disfluency removals and would only confuse a viewer.

    Args:
        vanilla_text: transcript from the base model (``None`` is treated as "").
        tuned_text: transcript from the adapter (``None`` is treated as "").

    Returns:
        An HTML string: one ``fw-diff-text`` div holding a span per word. All
        text taken from the transcript is HTML-escaped.
    """
    vanilla_text = vanilla_text or ""
    matches, removed = _removed_words(vanilla_text, tuned_text)
    out = []
    cursor = 0
    for i, m in enumerate(matches):
        # Inter-word text (spaces + punctuation) is passed through untouched.
        out.append(html.escape(vanilla_text[cursor:m.start()]))
        word = html.escape(m.group(0))
        if i in removed:
            out.append(
                f'<span class="fw-token fw-removed" data-tooltip="Removed by the cleaning LoRA">{word}</span>'
            )
        else:
            out.append(f'<span class="fw-token fw-kept" data-tooltip="Kept">{word}</span>')
        cursor = m.end()
    out.append(html.escape(vanilla_text[cursor:]))
    return f'<div class="fw-diff-text">{"".join(out)}</div>'


# ---------------------------------------------------------------------------
# Result metric — a normalized read-out shown beneath the diff
# ---------------------------------------------------------------------------
# Humanized per /humanizer: plain language, no em dashes, no promotional padding.
_WER_INFO = (
    "n-WER is normalized word error rate. Before scoring we lowercase the text "
    "and drop punctuation, so capitalization and commas never count against the "
    "model and only the words get compared. This is the usual way ASR systems "
    "are measured, Whisper's own benchmarks included. On the DisfluencySpeech "
    "test set, vanilla Whisper scores 9.4% and this adapter scores 3.4%."
)
# Shown in the result row before anything is transcribed, so the area reads
# clean instead of jumping when the first result lands.
METRIC_PLACEHOLDER = (
    '<div class="fw-metric fw-metric-empty">'
    '<span class="fw-metric-placeholder">Your cleaning summary will show up here.</span>'
    "</div>"
)


def metric_html(vanilla_text, tuned_text):
    """One-line read-out under the diff: how much this clip was cleaned (live,
    measured against the vanilla transcript on lowercased words only), plus a
    fixed benchmark badge with an info bubble that defines n-WER.

    The live figure is computed by the same helper the diff view uses, so the
    number of words reported here always equals the number struck through
    there. Punctuation and casing are never counted as a difference.

    Args:
        vanilla_text: transcript from the base model (``None`` is treated as "").
        tuned_text: transcript from the adapter (``None`` is treated as "").

    Returns:
        An HTML string: one ``fw-metric`` div with the live sentence and the badge.
    """
    matches, removed_idx = _removed_words(vanilla_text, tuned_text)
    removed = len(removed_idx)
    total = max(len(matches), 1)
    pct = round(100 * removed / total)
    if removed == 0:
        live = "No disfluencies detected in this clip."
    else:
        word = "word" if removed == 1 else "words"
        # On long clips a few removed words round down to 0; saying "0%" next
        # to a non-zero word count would contradict itself.
        share = f"{pct}%" if pct >= 1 else "under 1%"
        live = f"Cleaned {removed} disfluent {word} ({share} of what was spoken)."
    badge = (
        '<span class="fw-info" tabindex="0" '
        f'data-tooltip="{html.escape(_WER_INFO)}">'
        "Benchmarked using n-WER ⓘ</span>"
    )
    return f'<div class="fw-metric"><span class="fw-metric-live">{live}</span>{badge}</div>'
# ---------------------------------------------------------------------------
# Inference entry point (GPU)
# ---------------------------------------------------------------------------
@spaces.GPU(duration=60)
def transcribe(audio_path):
    """Transcribe one clip twice and return the diff view plus the summary line.

    Args:
        audio_path: path to the audio file; a falsy value means nothing was
            recorded or uploaded.

    Returns:
        A ``(diff_html, metric_html)`` pair of HTML strings. With no input,
        the prompt text and the empty metric placeholder are returned instead.
    """
    if not audio_path:
        prompt = '<span class="fw-placeholder-text">Record or upload some speech, then hit Transcribe.</span>'
        return prompt, METRIC_PLACEHOLDER

    bundle = load_models()
    model = bundle["model"]
    processor = bundle["processor"]
    device = bundle["device"]
    dtype = bundle["dtype"]
    audio = _load_audio_16k_mono(audio_path)

    # Vanilla = the same weights with the LoRA switched off, so we never hold a
    # second copy of the base model in memory.
    with model.disable_adapter():
        vanilla_text = _decode(model, processor, audio, device, dtype)
    tuned_text = _decode(model, processor, audio, device, dtype)

    diff_view = diff_html(vanilla_text, tuned_text)
    summary = metric_html(vanilla_text, tuned_text)
    return diff_view, summary
# ---------------------------------------------------------------------------
# Facts carousel — rotating highlights shown in the hero section
# ---------------------------------------------------------------------------
# Each entry is an HTML fragment, not plain text: _build_header_html inserts it
# into the page without escaping, so the <strong> tags are rendered as markup.
# Adjacent string literals within one entry are concatenated into one fact.
FACTS = [
    "On the DisfluencySpeech test set, vanilla Whisper scores "
    "<strong>9.4% WER</strong>. This adapter brings it down to <strong>3.4%</strong>.",
    "That is a <strong>6-point drop</strong> in word error rate, with a 95% "
    "bootstrap interval of [+5.0, +7.0]. The gain holds up.",
    "As far as we know, the <strong>only open Apache-2.0 model</strong> that removes "
    "fillers, discourse markers, repetitions, and self-repairs in one shot.",
    "It cleans your speech in a <strong>single pass</strong>. No second model, "
    "no LLM rewrite, no cloud round trip.",
    "Then it shows you <strong>exactly what it removed</strong>, struck through "
    "inline so you can trust the edit.",
    "Under the hood it is a small <strong>LoRA adapter</strong> on "
    "whisper-large-v3-turbo: rank 16, a few megabytes, loaded on top of the base.",
    "Trained on <strong>Modal</strong> from synthetic speech we built ourselves: "
    "LibriSpeech text, disfluencies injected, voiced with Kokoro across 54 voices.",
    "Runs <strong>offline on your own laptop</strong>. Apache-2.0, weights on "
    "Hugging Face, reproducible end to end.",
]
FACT_DURATION = 5  # seconds each fact stays visible before fading


def _fact_rotator_css(facts, duration=FACT_DURATION):
    """Generate the CSS keyframe animation for the rotating facts.

    Purely CSS-driven (no JavaScript). Every fact runs the same animation,
    whose full cycle lasts ``len(facts) * duration`` seconds, and each fact
    gets a staggered ``animation-delay`` so they cycle one after the other in
    an infinite loop.

    Args:
        facts: the sequence of facts; only its length is used.
        duration: seconds allotted to each fact.

    Returns:
        The CSS text, or an empty string when ``facts`` is empty (there is
        nothing to animate, and the per-fact percentage would divide by zero).
    """
    n = len(facts)
    if n == 0:
        return ""
    total = n * duration
    # Share of the full cycle during which one fact is on screen, and the
    # slice of that share spent fading in (and again fading out).
    show_pct = 100.0 / n
    fade_pct = show_pct * 0.15
    delays = "\n".join(
        f".fw-fact:nth-child({i + 1}) {{ animation-delay: {i * duration}s; }}"
        for i in range(n)
    )
    return f"""
/* ---- fact rotator animation (auto-generated for {n} facts) ------------ */
.fw-fact {{
animation: fw-fact-fade {total}s ease-in-out infinite;
}}
@keyframes fw-fact-fade {{
0% {{ opacity: 0; }}
{fade_pct:.1f}% {{ opacity: 1; }}
{show_pct - fade_pct:.1f}% {{ opacity: 1; }}
{show_pct:.1f}% {{ opacity: 0; }}
100% {{ opacity: 0; }}
}}
{delays}
"""
def _build_header_html(facts):
    """Return the hero markup: eyebrow, title, lede, then one div per fact.

    Facts are inserted as given, without HTML escaping, so they may carry
    their own inline markup.
    """
    rendered = []
    for fact in facts:
        rendered.append(f' <div class="fw-fact">{fact}</div>')
    fact_items = "\n".join(rendered)
    return f"""
<div class="fw-hero">
<div class="fw-hero-section">
<div class="fw-eyebrow">✻ Apache-2.0 · Runs offline · Trained on Modal</div>
</div>
<div class="fw-hero-section">
<h1><span class="fw-accent">fluent</span>Whisper</h1>
<p class="fw-lede">Speak messy. Read clean.</p>
</div>
<div class="fw-hero-section">
<div class="fw-fact-rotator">
{fact_items}
</div>
</div>
</div>
"""
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital,wght@0,400;1,400&family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,500;0,6..72,600;1,6..72,400&display=swap');
/* ---- palette ---------------------------------------------------------- */
:root, .gradio-container {
--fw-parchment: #f9f6f0; /* vintage paper background */
--fw-card: #fcfff9; /* clean white paper cards */
--fw-royal-gold: #9e7a44; /* antique brass/gold */
--fw-royal-gold-hover: #805e2f;
--fw-ink-dark: #1e130c; /* dark coffee-bean black */
--fw-ink-muted: #6b5c51; /* vintage gray-brown */
--fw-border: #dfd7ca; /* subtle line separator */
--fw-crimson: #a63f3c; /* editorial dark red for deleted text */
--fw-crimson-bg: #fdf3f2; /* soft pink highlighted background */
}
/* ---- page shell — fill the viewport, no floating, no dark bars -------- */
html, body, gradio-app, .gradio-container > .main,
.gradio-container .wrap, .app, #root {
background: var(--fw-parchment) !important;
color-scheme: light;
}
.gradio-container {
max-width: 1000px !important;
width: calc(100% - 4rem) !important;
margin: 3rem auto !important;
background: var(--fw-card) !important;
color: var(--fw-ink-dark) !important;
font-family: 'Instrument Serif', Georgia, 'Times New Roman', serif !important;
padding: 4rem 3rem !important;
border: 6px double var(--fw-royal-gold) !important;
border-radius: 0 !important;
box-shadow: 0 10px 40px rgba(158,122,68,0.05) !important;
overflow: visible !important;
}
.gradio-container .prose, .gradio-container p, .gradio-container span,
.gradio-container label { color: var(--fw-ink-dark); }
/* ---- hero — centered sections vertically stacked --------------------- */
.gradio-container .block.fw-hero-container,
.gradio-container .block.fw-hero-container > .prose,
.gradio-container .block.fw-hero-container > .wrap,
.gradio-container .block.fw-hero-container > div,
.gradio-container .block.fw-footer-container,
.gradio-container .block.fw-footer-container > .prose,
.gradio-container .block.fw-footer-container > .wrap,
.gradio-container .block.fw-footer-container > div {
  /* Undo the card look that the generic .block rule applies, so the hero and
     footer sit flat on the page. "border: none" replaces a bare width with no
     border-style, which already rendered as no border. */
  background: transparent !important;
  border: none !important;
  box-shadow: none !important;
  padding: 0.2rem !important;
  display: flex !important;
  flex-direction: column !important;
}
.fw-hero {
background: transparent !important;
border: none !important;
padding: 0 !important;
margin: 0 0 2rem 0 !important;
text-align: center !important;
display: flex !important;
flex-direction: column !important;
justify-content: center !important;
height: auto !important;
min-height: auto !important;
gap: 1.5rem !important;
flex-grow: 0 !important;
}
.fw-hero-section {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
}
.fw-eyebrow {
font-family: 'Instrument Serif', serif;
font-style: normal;
text-transform: none;
font-size: 1.25rem;
font-weight: 600;
color: var(--fw-royal-gold);
margin: 0;
text-align: center !important;
}
.fw-hero h1 {
font-family: 'Instrument Serif', Georgia, serif !important;
font-weight: 400 !important;
font-size: 4.2rem !important;
line-height: 1.04 !important;
color: var(--fw-ink-dark) !important;
margin: 0 0 0.5rem !important;
font-style: normal !important;
text-align: center !important;
}
.fw-hero h1 .fw-accent {
font-style: italic !important;
color: var(--fw-royal-gold) !important;
}
.fw-lede {
font-family: 'Instrument Serif', Georgia, serif !important;
font-size: 2rem !important;
color: var(--fw-ink-muted) !important;
margin: 0 !important;
text-align: center !important;
}
.fw-tagline {
font-family: 'Newsreader', Georgia, serif !important;
font-size: 1.15rem !important;
line-height: 1.6 !important;
color: var(--fw-ink-dark) !important;
max-width: 32rem !important;
margin: 0 auto !important;
text-align: center !important;
}
.fw-tagline strong { color: var(--fw-royal-gold); font-weight: 600; }
/* ---- WER stat — the visual anchor of the hero ------------------------- */
.fw-wer {
display: flex;
align-items: baseline;
justify-content: center !important;
gap: 0.8rem;
margin: 0 0 0.5rem;
}
.fw-wer-num {
font-family: 'Instrument Serif', Georgia, serif;
font-size: 3.5rem;
line-height: 1;
color: var(--fw-ink-muted);
}
.fw-wer-good { color: var(--fw-royal-gold); font-weight: 600; }
.fw-wer-arrow { font-size: 2.2rem; color: var(--fw-ink-muted); }
.fw-wer-unit {
font-family: 'Instrument Serif', Georgia, serif;
font-size: 1.8rem;
color: var(--fw-ink-dark);
letter-spacing: 0.04em;
}
.fw-wer-note {
font-family: 'Newsreader', Georgia, serif !important;
font-size: 0.95rem !important;
line-height: 1.6 !important;
color: var(--fw-ink-muted) !important;
margin: 0 auto !important;
max-width: 28rem !important;
text-align: center !important;
box-sizing: border-box;
}
/* ---- fact rotator ----------------------------------------------------- */
.fw-fact-rotator {
display: grid;
place-items: center;
min-height: 2.5rem;
width: 100%;
}
.fw-fact {
grid-area: 1 / 1;
text-align: center;
opacity: 0;
font-family: 'Newsreader', Georgia, serif;
font-size: 1.15rem;
line-height: 1.5;
color: var(--fw-ink-muted);
max-width: 36rem;
width: 100%;
padding: 0 1rem;
box-sizing: border-box;
}
.fw-fact strong {
color: var(--fw-royal-gold);
font-weight: 600;
}
/* ---- blocks / cards --------------------------------------------------- */
.gradio-container .block,
.gradio-container .form,
.gradio-container .gr-box {
background: var(--fw-card) !important;
border: 1px solid var(--fw-border) !important;
border-radius: 8px !important;
box-shadow: 0 2px 8px rgba(158,122,68,0.03) !important;
}
/* Style block headers and labels */
.gradio-container .block-label,
.gradio-container .label-wrap,
.gradio-container span[data-testid="block-info"],
.gradio-container .block .label-wrap {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0.4rem 0.8rem 0 !important;
}
/* SMALLER, cleaner component labels to remove bloat */
.gradio-container .block-label span,
.gradio-container .label-wrap span,
.gradio-container label > span,
.gradio-container .block-info,
.gradio-container span[data-testid="block-info"] {
font-family: 'Instrument Serif', serif !important;
font-style: normal !important;
text-transform: none !important;
letter-spacing: 0.02em !important;
font-size: 1.15rem !important;
font-weight: 500 !important;
color: var(--fw-royal-gold) !important;
background: transparent !important;
}
.gradio-container .block-label svg,
.gradio-container .label-wrap svg { color: var(--fw-royal-gold) !important; }
.gradio-container textarea,
.gradio-container input {
font-family: 'Newsreader', Georgia, serif !important;
font-size: 1.05rem !important;
color: var(--fw-ink-dark) !important;
background: var(--fw-card) !important;
border: 1px solid var(--fw-border) !important;
border-radius: 4px !important;
}
/* ---- tabs and source navigation -------------------------------------- */
.gradio-container .tab-nav,
.gradio-container div[role="tablist"] {
background: transparent !important;
border-bottom: 1px solid var(--fw-border) !important;
gap: 0.5rem !important;
padding: 0.2rem 0 !important;
}
.gradio-container .tab-nav button,
.gradio-container button[role="tab"] {
font-family: 'Instrument Serif', Georgia, serif !important;
font-size: 1.15rem !important;
font-style: normal !important;
color: var(--fw-ink-muted) !important;
background: transparent !important;
border: 1px solid transparent !important;
border-radius: 4px 4px 0 0 !important;
padding: 0.4rem 1rem !important;
transition: all 0.15s ease !important;
}
.gradio-container .tab-nav button.selected,
.gradio-container button[role="tab"][aria-selected="true"] {
color: var(--fw-ink-dark) !important;
font-weight: 600 !important;
background: var(--fw-card) !important;
border: 1px solid var(--fw-border) !important;
border-bottom-color: var(--fw-card) !important;
box-shadow: none !important;
}
/* ---- pressable gold button ------------------------------------------- */
.fw-btn, .fw-btn button {
background: var(--fw-royal-gold) !important;
color: #fffdfa !important;
font-family: 'Instrument Serif', Georgia, serif !important;
font-weight: 600 !important;
font-size: 1.15rem !important;
letter-spacing: 0.05em !important;
text-transform: uppercase !important;
border: 1px solid var(--fw-royal-gold-hover) !important;
border-radius: 4px !important;
padding: 0.8rem 2.2rem !important;
box-shadow: 0 2px 4px rgba(26,15,8,0.1) !important;
transition: all 0.2s ease !important;
}
.fw-btn:hover, .fw-btn button:hover {
background: var(--fw-royal-gold-hover) !important;
box-shadow: 0 4px 8px rgba(26,15,8,0.15) !important;
transform: translateY(-1px);
}
.fw-btn:active, .fw-btn button:active {
background: var(--fw-royal-gold-hover) !important;
box-shadow: inset 0 2px 4px rgba(0,0,0,0.2) !important;
transform: translateY(0);
}
/* ---- force button text color inside components (microphone/upload, etc) -- */
.gradio-container button {
color: var(--fw-ink-dark) !important;
font-family: 'Instrument Serif', Georgia, serif !important;
font-style: normal !important;
font-weight: 500 !important;
font-size: 1.15rem !important;
}
/* ---- scale audio component fonts so they are not diminished ---- */
.gradio-container .fw-audio,
.gradio-container .fw-audio * {
font-size: 1.05rem !important;
}
.gradio-container .fw-audio .block-label span {
font-size: 1.15rem !important;
}
.gradio-container .fw-audio button {
font-size: 1.15rem !important;
}
/* Keep the waveform / playback chrome from spilling out of its box */
.gradio-container .fw-audio .waveform-container,
.gradio-container .fw-audio .controls,
.gradio-container .fw-audio .component-wrapper {
max-width: 100% !important;
overflow: hidden !important;
box-sizing: border-box !important;
}
.gradio-container .fw-audio .controls * { line-height: normal !important; }
.fw-placeholder-text {
color: var(--fw-ink-dark) !important;
font-style: normal !important;
font-size: 1.15rem !important;
font-family: 'Newsreader', Georgia, serif !important;
}
/* ---- diff card & text ------------------------------------------------- */
.gradio-container .fw-diff {
background: var(--fw-card) !important;
border: 1px solid var(--fw-border) !important;
border-radius: 8px !important;
padding: 1.5rem !important;
min-height: 80px !important;
overflow: visible !important;
}
.fw-diff-text, .fw-placeholder-text {
display: block;
border-left: 4px solid var(--fw-royal-gold);
padding-left: 1.5rem;
margin-left: 0.2rem;
border-radius: 4px;
}
.fw-diff-text {
font-size: 1.25rem !important;
line-height: 2 !important;
color: var(--fw-ink-dark) !important;
font-family: 'Newsreader', Georgia, serif !important;
}
.fw-diff-text span:not(.fw-removed) {
color: var(--fw-ink-dark) !important;
}
/* Tooltip container */
.fw-token {
position: relative;
display: inline-block;
cursor: pointer;
padding: 0 1px;
}
/* Tooltip text bubble */
.fw-token::after {
content: attr(data-tooltip);
position: absolute;
bottom: 130%;
left: 50%;
transform: translateX(-50%) scale(0.95);
background-color: var(--fw-royal-gold) !important;
color: var(--fw-parchment) !important;
padding: 6px 10px;
border-radius: 6px;
font-size: 0.85rem;
font-family: 'Newsreader', Georgia, serif;
font-weight: 500;
text-decoration: none !important;
font-style: normal !important;
white-space: nowrap;
z-index: 1000;
box-shadow: 0 4px 15px rgba(158, 122, 68, 0.15);
opacity: 0;
pointer-events: none;
transition: opacity 0.15s ease, transform 0.15s cubic-bezier(0.16, 1, 0.3, 1);
}
/* Tooltip arrow */
.fw-token::before {
content: "";
position: absolute;
bottom: 115%;
left: 50%;
transform: translateX(-50%) scale(0.95);
border-width: 6px;
border-style: solid;
border-color: var(--fw-royal-gold) transparent transparent transparent !important;
z-index: 1000;
opacity: 0;
pointer-events: none;
transition: opacity 0.15s ease, transform 0.15s cubic-bezier(0.16, 1, 0.3, 1);
}
.fw-token:hover::after,
.fw-token:hover::before {
opacity: 1;
transform: translateX(-50%) scale(1);
}
.fw-kept:hover {
background-color: rgba(158, 122, 68, 0.08);
border-radius: 3px;
}
.fw-removed {
text-decoration: line-through;
color: var(--fw-crimson);
background-color: var(--fw-crimson-bg);
text-decoration-thickness: 1.5px;
text-decoration-color: var(--fw-crimson);
padding: 0 3px;
border-radius: 3px;
}
.fw-removed:hover {
background-color: #fbdcd9;
}
/* ---- result metric line ---------------------------------------------- */
.gradio-container .fw-metric-wrap {
overflow: visible !important;
}
.fw-metric {
  /* min-height rather than height: this row is allowed to wrap, and a fixed
     height would let the second line spill out of the box on narrow screens.
     The one-line layout still reserves the same 36px. */
  min-height: 36px;
  display: flex;
  flex-wrap: wrap;
  align-items: baseline;
  justify-content: space-between;
  gap: 0.6rem 1.2rem;
  margin: 0.9rem 0.2rem 0.9rem;
  padding: 0 0.2rem;
  font-family: 'Newsreader', Georgia, serif;
}
.fw-metric-live {
font-size: 1.05rem;
color: var(--fw-ink-muted);
}
/* empty state — a faint grey placeholder so a fresh visit reads clean */
.fw-metric-empty {
justify-content: flex-start;
}
.fw-metric-placeholder {
font-size: 1.05rem;
color: #b7ad9f;
}
.fw-info {
position: relative;
display: inline-block;
cursor: help;
font-size: 0.95rem;
font-weight: 600;
color: var(--fw-royal-gold);
letter-spacing: 0.01em;
border-bottom: 1px dotted var(--fw-royal-gold);
outline: none;
}
/* wrapping tooltip bubble (the .fw-token tooltip is single-line; this one wraps) */
.fw-info::after {
content: attr(data-tooltip);
position: absolute;
bottom: 150%;
right: 0;
width: 280px;
max-width: 78vw;
white-space: normal;
text-align: left;
line-height: 1.45;
background-color: var(--fw-ink-dark);
color: var(--fw-parchment);
padding: 10px 12px;
border-radius: 8px;
font-size: 0.85rem;
font-weight: 400;
font-family: 'Newsreader', Georgia, serif;
z-index: 1000;
box-shadow: 0 6px 20px rgba(30, 19, 12, 0.25);
opacity: 0;
pointer-events: none;
transform: translateY(4px);
transition: opacity 0.15s ease, transform 0.15s ease;
}
.fw-info:hover::after,
.fw-info:focus::after {
opacity: 1;
transform: translateY(0);
}
/* ---- footer ----------------------------------------------------------- */
.fw-footer {
text-align: center;
font-family: 'Instrument Serif', serif;
font-style: normal;
font-weight: bold;
font-size: 1.15rem;
color: var(--fw-ink-muted);
padding: 1.5rem 0 0.5rem;
margin-top: 1.5rem;
border-top: 1px solid var(--fw-border);
}
.fw-footer .fw-heart { color: var(--fw-crimson); }
.fw-footer a {
color: var(--fw-royal-gold);
font-weight: bold;
text-decoration: none;
border-bottom: 1px dotted var(--fw-royal-gold);
}
.fw-footer a:hover { color: var(--fw-royal-gold-hover); border-bottom-style: solid; }
/* hide gradio's default footer */
footer { display: none !important; }
/* ---- responsiveness --------------------------------------------------- */
@media (max-width: 900px) {
.gradio-container {
width: calc(100% - 1.5rem) !important;
margin: 1rem auto !important;
padding: 2.5rem 1.5rem !important;
border-width: 4px !important;
}
.fw-layout-row {
flex-direction: column !important;
gap: 3.5rem !important;
}
.gradio-container .block.fw-hero-container,
.gradio-container .block.fw-hero-container > .prose,
.gradio-container .block.fw-hero-container > .wrap,
.gradio-container .block.fw-hero-container > div {
height: auto !important;
}
.fw-hero {
min-height: auto !important;
height: auto !important;
gap: 2.5rem !important;
}
.fw-hero h1 {
font-size: 3rem !important;
}
.fw-lede {
font-size: 1.6rem !important;
}
.fw-wer-num {
font-size: 2.8rem !important;
}
}
"""
# Hero markup, rendered once at import time from the FACTS list.
HEADER_HTML = _build_header_html(FACTS)
# ---------------------------------------------------------------------------
# Curated example clips (real DisfluencySpeech test audio). Each entry is only
# offered if its .wav is actually present, so a missing file never breaks the
# build. Paths are resolved next to this file so they work locally and on Space.
# ---------------------------------------------------------------------------
# Directory holding the bundled example clips, located next to this file.
_EX_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "examples")
# (file name, label shown in the UI) for every clip we would like to offer.
_EXAMPLE_SPECS = [
    ("idx_049.wav", "Spontaneous interview answer"),
    ("idx_144.wav", "Casual spoken reply"),
    ("idx_222.wav", "Off-the-cuff explanation"),
    ("idx_083.wav", "Unscripted remark"),
]
# Keep only the clips that exist on disk; both public lists are derived from
# this one filtered list so rows and labels stay aligned.
_PRESENT_EXAMPLES = [
    (path, label)
    for path, label in (
        (os.path.join(_EX_DIR, fn), label) for fn, label in _EXAMPLE_SPECS
    )
    if os.path.exists(path)
]
EXAMPLES = [[path] for path, _label in _PRESENT_EXAMPLES]
EXAMPLE_LABELS = [label for _path, label in _PRESENT_EXAMPLES]
# Attribution footer; build_demo renders it inside the "fw-footer-container" block.
FOOTER_HTML = """
<div class="fw-footer">
Made with <span class="fw-heart">&#10084;</span> by
<a href="https://aipdv.com" target="_blank" rel="noopener">Prabhudayal Vaishnav</a>
</div>
"""
def _theme():
    """Build the Soft-based Gradio theme that carries the paper palette.

    Every colour is set for light mode and mirrored to its ``_dark`` variant,
    so the page looks the same whichever mode the browser asks for.
    """
    stone = gr.themes.colors.stone
    theme = gr.themes.Soft(
        primary_hue=stone,
        secondary_hue=stone,
        neutral_hue=stone,
        font=[gr.themes.GoogleFont("Instrument Serif"), "Georgia", "serif"],
    )

    parchment = "#f9f6f0"
    card = "#fcfbf9"
    gold = "#9e7a44"
    gold_hover = "#805e2f"
    ink = "#1e130c"
    rule = "#dfd7ca"

    # Settings that take the same value in light and dark mode.
    mirrored = {
        "body_background_fill": parchment,
        "block_background_fill": card,
        "block_label_text_color": gold,
        "block_title_text_color": gold,
        "block_label_background_fill": "transparent",
        "block_label_border_width": "0px",
        "input_background_fill": card,
        "button_primary_background_fill": gold,
        "button_primary_text_color": card,
        "button_secondary_background_fill": card,
        "button_secondary_text_color": ink,
        "button_secondary_border_color": rule,
        "link_text_color": gold,
    }
    # The hover fill is the one setting given for light mode only.
    overrides = {"button_primary_background_fill_hover": gold_hover}
    for key, value in mirrored.items():
        overrides[key] = value
        overrides[key + "_dark"] = value

    theme.set(**overrides)
    return theme
def build_demo():
    """Assemble the Gradio Blocks app and return it without launching it.

    Top to bottom: hero, audio input, optional example clips, the Transcribe
    button, the diff view, the summary line, and the footer.
    """
    stylesheet = CUSTOM_CSS + _fact_rotator_css(FACTS)
    prompt = '<span class="fw-placeholder-text">Record or upload some speech, then hit Transcribe.</span>'

    with gr.Blocks(title="fluentWhisper", theme=_theme(), css=stylesheet) as demo:
        gr.HTML(HEADER_HTML, elem_classes=["fw-hero-container"])

        # The mid-recording "page unresponsive" freeze came from the cold
        # model load blocking the first request, which the lazy load inside
        # the @spaces.GPU transcribe function keeps off the page thread.
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Speak or upload audio",
            elem_classes=["fw-audio"],
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )

        # Curated DisfluencySpeech test clips where the adapter cleanly removes
        # real discourse markers and repetitions. These are the examples behind
        # the 3.4% WER number, so they show the model at its best for the demo.
        if EXAMPLES:
            gr.Examples(
                examples=EXAMPLES,
                inputs=audio_in,
                label="Or try a real clip from the DisfluencySpeech test set",
                example_labels=EXAMPLE_LABELS,
            )

        run_btn = gr.Button("Transcribe", variant="primary", elem_classes=["fw-btn"])

        # The diff is the headline result, so it sits right under the button —
        # no scrolling past empty boxes to see what the model did.
        diff_out = gr.HTML(
            label="What the LoRA removed",
            elem_classes=["fw-diff"],
            value=prompt,
        )
        metric_out = gr.HTML(value=METRIC_PLACEHOLDER, elem_classes=["fw-metric-wrap"])

        # Pressing the button and finishing a recording both run the same
        # transcription into the same two outputs.
        results = [diff_out, metric_out]
        for register in (run_btn.click, audio_in.stop_recording):
            register(transcribe, inputs=audio_in, outputs=results)

        gr.HTML(FOOTER_HTML, elem_classes=["fw-footer-container"])

    return demo
# Build the UI at import time so `demo` exists as a module-level attribute.
demo = build_demo()
# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()