Spaces:

thomasjvu
/

lisper-zerogpu

Running on Zero

App Files Files Community

lisper-zerogpu / app.py

thomasjvu

Deploy Lisper ZeroGPU Space

400c600 verified 14 days ago

raw

history blame contribute delete

51.9 kB

	from __future__ import annotations

	import base64
	import io
	import json
	import math
	import os
	import re
	import tempfile
	import traceback
	from typing import Any
	from pathlib import Path

	# Unsloth's compiled Gemma 4 audio path can trip TorchDynamo on ZeroGPU's
	# runtime torch build. Keep inference eager for reliability.
	os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
	os.environ.setdefault("UNSLOTH_COMPILE_DISABLE", "1")

	import gradio as gr
	import librosa
	import numpy as np
	import soundfile as sf
	import spaces
	import torch
	from transformers import AutoProcessor, Gemma4ForConditionalGeneration

	from live_audio_policy import (
	LiveAudioPolicy,
	compute_live_audio_diagnostics,
	decide_live_analysis,
	validate_live_audio_diagnostics,
	)


	ALLOWED_CLASSES = {"clear", "frontal", "lateral", "dental", "palatal"}
	DEFAULT_MODEL_ID = "thomasjvu/lisper-gemma4-e2b-audio-full"
	DEFAULT_ADAPTER_ID = ""
	SPACE_ROOT = Path(__file__).resolve().parent
	ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
	ACOUSTIC_EXTRATREES_MODEL_PATH = SPACE_ROOT / "acoustic_extratrees_v18.joblib"
	ACOUSTIC_K = 5
	ACOUSTIC_MIN_CONFIDENCE = 0.42
	KNN_OVERRIDE_MAX_DISTANCE = 0.25
	KNN_OVERRIDE_MIN_CONFIDENCE = 0.90
	LIVE_CLEAR_MIN_CONFIDENCE = 0.85
	LIVE_CLEAR_MIN_MARGIN = 0.25
	LIVE_NONCLEAR_MIN_CONFIDENCE = 0.55
	LIVE_NONCLEAR_MIN_MARGIN = 0.12
	MIN_AUDIO_SECONDS = 0.45
	MIN_AUDIO_RMS = 0.0015
	MIN_AUDIO_PEAK = 0.012
	MIN_VOICED_RATIO = 0.002
	MIN_SPEECH_FRAME_RATIO = 0.04
	MIN_TONAL_FRAME_RATIO = 0.04
	MIN_SIBILANT_FRAME_RATIO = 0.015
	MAX_NOISE_FLATNESS = 0.40
	MAX_CLIPPING_RATIO = 0.08
	DEFAULT_PROMPT = """Analyze this pronunciation attempt for lisp type and give concise corrective coaching.

	Return exactly four labeled lines in this order:
	Detected class: clear\|frontal\|lateral\|dental\|palatal
	Reason: one brief reason tied to tongue placement or airflow
	Corrective cue: one concrete next-step cue
	Encouragement: one brief supportive line"""

	CLASS_TEMPLATES = {
	"clear": {
	"reason": "The acoustic pattern did not strongly match the trained lisp-pattern examples, so this is treated as a tentative clear result.",
	"cue": "Repeat once at a relaxed pace and keep the airflow centered through the front of the mouth.",
	},
	"dental": {
	"reason": "The acoustic pattern is closest to the dental examples, where tongue contact near the teeth can narrow the /s/ groove.",
	"cue": "Relax the tongue slightly off the teeth and keep a narrow stream of air moving forward.",
	},
	"frontal": {
	"reason": "The acoustic pattern is closest to the frontal examples, which often sound like the airflow is too far forward.",
	"cue": "Keep the tongue tip just behind the upper teeth and send the air straight forward through a small groove.",
	},
	"lateral": {
	"reason": "The acoustic pattern is closest to the lateral examples, where air may be escaping around the sides of the tongue.",
	"cue": "Start from a light /t/ position, seal the tongue sides, and let the air move forward through the center.",
	},
	"palatal": {
	"reason": "The acoustic pattern is closest to the palatal examples, where the tongue can sit too far back and muffle the sound.",
	"cue": "Bring the tongue tip slightly forward behind the upper teeth and brighten the airflow.",
	},
	}

	GUARDED_CLASS_TEMPLATES = {
	"dental": {
	"reason": "The acoustic model was not confident enough to call this clear; the nearest non-clear pattern is dental, which can happen when the tongue presses too close to the teeth.",
	"cue": "Try one slower repetition with the tongue relaxed just behind the teeth and the air moving forward through a narrow center groove.",
	},
	"frontal": {
	"reason": "The acoustic model was not confident enough to call this clear; the nearest non-clear pattern is frontal, which can happen when the tongue or airflow moves too far forward.",
	"cue": "Keep the tongue tip behind the upper teeth and avoid letting it push between the teeth during /s/ sounds.",
	},
	"lateral": {
	"reason": "The acoustic model was not confident enough to call this clear; the nearest non-clear pattern is lateral, where air may be leaking around the tongue sides.",
	"cue": "Seal the tongue sides lightly against the upper molars and aim the air straight down the middle.",
	},
	"palatal": {
	"reason": "The acoustic model was not confident enough to call this clear; the nearest non-clear pattern is palatal, where the tongue may be sitting too far back.",
	"cue": "Bring the tongue tip forward just behind the upper teeth and brighten the /s/ airflow.",
	},
	}

	BROWSER_RECORDER_START_JS = r"""
	async (payload) => {
	const state = window.__lisperRecorder \|\| {};
	if (state.recording) {
	return [payload \|\| "", "Recording is already active.", ""];
	}

	if (!navigator.mediaDevices \|\| !navigator.mediaDevices.getUserMedia) {
	return [payload \|\| "", "This browser cannot access microphone recording.", ""];
	}

	const stream = await navigator.mediaDevices.getUserMedia({
	audio: {
	channelCount: 1,
	echoCancellation: false,
	noiseSuppression: false,
	autoGainControl: true,
	},
	video: false,
	});

	const AudioContextCtor = window.AudioContext \|\| window.webkitAudioContext;
	const audioContext = new AudioContextCtor({ sampleRate: 16000 });
	await audioContext.resume();

	const source = audioContext.createMediaStreamSource(stream);
	const processor = audioContext.createScriptProcessor(4096, 1, 1);
	const silentGain = audioContext.createGain();
	silentGain.gain.value = 0;

	const chunks = [];
	let peak = 0;
	let sumSquares = 0;
	let sampleCount = 0;

	processor.onaudioprocess = (event) => {
	if (!window.__lisperRecorder?.recording) {
	return;
	}
	const input = event.inputBuffer.getChannelData(0);
	const copy = new Float32Array(input.length);
	copy.set(input);
	chunks.push(copy);
	for (let i = 0; i < copy.length; i += 1) {
	const value = copy[i];
	const absValue = Math.abs(value);
	if (absValue > peak) peak = absValue;
	sumSquares += value * value;
	}
	sampleCount += copy.length;
	};

	source.connect(processor);
	processor.connect(silentGain);
	silentGain.connect(audioContext.destination);

	window.__lisperRecorder = {
	recording: true,
	stream,
	audioContext,
	source,
	processor,
	silentGain,
	chunks,
	startedAt: Date.now(),
	getStats: () => ({
	peak,
	rms: sampleCount ? Math.sqrt(sumSquares / sampleCount) : 0,
	sampleCount,
	sampleRate: audioContext.sampleRate,
	}),
	};

	return ["", "Recording through Web Audio... press Stop when finished.", ""];
	}
	"""

	BROWSER_RECORDER_STOP_JS = r"""
	async () => {
	const state = window.__lisperRecorder;
	if (!state \|\| !state.recording) {
	return ["", "No active browser recording. Press Record first.", ""];
	}

	state.recording = false;
	try { state.processor.disconnect(); } catch (_) {}
	try { state.source.disconnect(); } catch (_) {}
	try { state.silentGain.disconnect(); } catch (_) {}
	for (const track of state.stream.getTracks()) {
	track.stop();
	}

	const stats = state.getStats();
	const sampleRate = stats.sampleRate \|\| 16000;
	const totalLength = state.chunks.reduce((sum, chunk) => sum + chunk.length, 0);
	const samples = new Float32Array(totalLength);
	let offset = 0;
	for (const chunk of state.chunks) {
	samples.set(chunk, offset);
	offset += chunk.length;
	}
	await state.audioContext.close().catch(() => undefined);
	window.__lisperRecorder = null;

	function writeString(view, byteOffset, string) {
	for (let i = 0; i < string.length; i += 1) {
	view.setUint8(byteOffset + i, string.charCodeAt(i));
	}
	}

	function encodeWav(floatSamples, wavSampleRate) {
	const bytesPerSample = 2;
	const blockAlign = bytesPerSample;
	const buffer = new ArrayBuffer(44 + floatSamples.length * bytesPerSample);
	const view = new DataView(buffer);
	writeString(view, 0, "RIFF");
	view.setUint32(4, 36 + floatSamples.length * bytesPerSample, true);
	writeString(view, 8, "WAVE");
	writeString(view, 12, "fmt ");
	view.setUint32(16, 16, true);
	view.setUint16(20, 1, true);
	view.setUint16(22, 1, true);
	view.setUint32(24, wavSampleRate, true);
	view.setUint32(28, wavSampleRate * blockAlign, true);
	view.setUint16(32, blockAlign, true);
	view.setUint16(34, 16, true);
	writeString(view, 36, "data");
	view.setUint32(40, floatSamples.length * bytesPerSample, true);
	let byteOffset = 44;
	for (let i = 0; i < floatSamples.length; i += 1, byteOffset += 2) {
	const clamped = Math.max(-1, Math.min(1, floatSamples[i]));
	view.setInt16(byteOffset, clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff, true);
	}
	return new Blob([view], { type: "audio/wav" });
	}

	const blob = encodeWav(samples, sampleRate);
	const dataUrl = await new Promise((resolve, reject) => {
	const reader = new FileReader();
	reader.onload = () => resolve(reader.result);
	reader.onerror = () => reject(reader.error);
	reader.readAsDataURL(blob);
	});

	const durationSeconds = sampleRate ? samples.length / sampleRate : 0;
	const payload = JSON.stringify({
	source: "browser-web-audio-wav",
	data_url: dataUrl,
	mime_type: "audio/wav",
	sample_rate: sampleRate,
	sample_count: samples.length,
	duration_seconds: Number(durationSeconds.toFixed(3)),
	peak: Number(stats.peak.toFixed(6)),
	rms: Number(stats.rms.toFixed(6)),
	created_at: new Date().toISOString(),
	});

	const status = stats.peak < 0.003
	? `Clip captured but appears very quiet. peak=${stats.peak.toFixed(6)} rms=${stats.rms.toFixed(6)}. Check browser microphone permission/input.`
	: `Clip ready: ${durationSeconds.toFixed(1)}s, peak=${stats.peak.toFixed(3)}. Playback should contain your voice.`;
	const playback = `<audio controls src="${dataUrl}" style="width:100%;"></audio>`;

	return [payload, status, playback];
	}
	"""

	BROWSER_RECORDER_CLEAR_JS = r"""
	async () => {
	const state = window.__lisperRecorder;
	if (state?.recording) {
	state.recording = false;
	try { state.processor.disconnect(); } catch (_) {}
	try { state.source.disconnect(); } catch (_) {}
	try { state.silentGain.disconnect(); } catch (_) {}
	for (const track of state.stream.getTracks()) {
	track.stop();
	}
	await state.audioContext.close().catch(() => undefined);
	}
	window.__lisperRecorder = null;
	return ["", "No browser recording ready.", ""];
	}
	"""


	class InvalidAudioError(ValueError):
	"""Raised when a clip is too short or too quiet to analyze honestly."""

	def __init__(self, message: str, diagnostics: dict[str, Any]):
	super().__init__(message)
	self.diagnostics = diagnostics


	def env_int(name: str, default: int) -> int:
	try:
	return int(os.environ.get(name, str(default)))
	except ValueError:
	return default


	def env_float(name: str, default: float) -> float:
	try:
	return float(os.environ.get(name, str(default)))
	except ValueError:
	return default


	def model_id() -> str:
	return os.environ.get("LISPER_ZERO_GPU_MODEL_ID", DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID


	def adapter_id() -> str:
	return os.environ.get("LISPER_ZERO_GPU_ADAPTER_ID", DEFAULT_ADAPTER_ID).strip()


	def max_new_tokens() -> int:
	return env_int("LISPER_ZERO_GPU_MAX_NEW_TOKENS", 96)


	def max_seq_length() -> int:
	return env_int("LISPER_ZERO_GPU_MAX_SEQ_LENGTH", 2048)


	def zero_gpu_size() -> str:
	requested = os.environ.get("LISPER_ZERO_GPU_SIZE", "large").strip().lower()
	return "xlarge" if requested == "xlarge" else "large"


	def eager_load_enabled() -> bool:
	return os.environ.get("LISPER_ZERO_GPU_EAGER_LOAD", "0").strip() != "0"


	def load_in_4bit_enabled() -> bool:
	default = "1" if adapter_id() else "0"
	return os.environ.get("LISPER_ZERO_GPU_LOAD_IN_4BIT", default).strip() != "0"


	def acoustic_hint_enabled() -> bool:
	return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"


	def acoustic_model_preference() -> str:
	requested = os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_MODEL", "auto").strip().lower()
	if requested in {"extratrees", "knn"}:
	return requested
	return "auto"


	def live_clear_min_confidence() -> float:
	return env_float("LISPER_ZERO_GPU_LIVE_CLEAR_MIN_CONFIDENCE", LIVE_CLEAR_MIN_CONFIDENCE)


	def live_clear_min_margin() -> float:
	return env_float("LISPER_ZERO_GPU_LIVE_CLEAR_MIN_MARGIN", LIVE_CLEAR_MIN_MARGIN)


	def live_nonclear_min_confidence() -> float:
	return env_float(
	"LISPER_ZERO_GPU_LIVE_NONCLEAR_MIN_CONFIDENCE",
	env_float("LISPER_ZERO_GPU_LIVE_NONCLEAR_MIN_SCORE", LIVE_NONCLEAR_MIN_CONFIDENCE),
	)


	def live_nonclear_min_margin() -> float:
	return env_float("LISPER_ZERO_GPU_LIVE_NONCLEAR_MIN_MARGIN", LIVE_NONCLEAR_MIN_MARGIN)


	def knn_override_max_distance() -> float:
	return env_float("LISPER_ZERO_GPU_KNN_OVERRIDE_MAX_DISTANCE", KNN_OVERRIDE_MAX_DISTANCE)


	def knn_override_min_confidence() -> float:
	return env_float("LISPER_ZERO_GPU_KNN_OVERRIDE_MIN_CONFIDENCE", KNN_OVERRIDE_MIN_CONFIDENCE)


	def live_audio_policy() -> LiveAudioPolicy:
	return LiveAudioPolicy(
	min_audio_seconds=env_float("LISPER_ZERO_GPU_MIN_AUDIO_SECONDS", MIN_AUDIO_SECONDS),
	min_peak=env_float("LISPER_ZERO_GPU_MIN_AUDIO_PEAK", MIN_AUDIO_PEAK),
	min_rms=env_float("LISPER_ZERO_GPU_MIN_AUDIO_RMS", MIN_AUDIO_RMS),
	min_voiced_ratio=env_float("LISPER_ZERO_GPU_MIN_VOICED_RATIO", MIN_VOICED_RATIO),
	min_speech_frame_ratio=env_float("LISPER_ZERO_GPU_MIN_SPEECH_FRAME_RATIO", MIN_SPEECH_FRAME_RATIO),
	min_tonal_frame_ratio=env_float("LISPER_ZERO_GPU_MIN_TONAL_FRAME_RATIO", MIN_TONAL_FRAME_RATIO),
	min_sibilant_frame_ratio=env_float("LISPER_ZERO_GPU_MIN_SIBILANT_FRAME_RATIO", MIN_SIBILANT_FRAME_RATIO),
	max_noise_flatness=env_float("LISPER_ZERO_GPU_MAX_NOISE_FLATNESS", MAX_NOISE_FLATNESS),
	max_clipping_ratio=env_float("LISPER_ZERO_GPU_MAX_CLIPPING_RATIO", MAX_CLIPPING_RATIO),
	clear_min_confidence=live_clear_min_confidence(),
	clear_min_margin=live_clear_min_margin(),
	nonclear_min_confidence=live_nonclear_min_confidence(),
	nonclear_min_margin=live_nonclear_min_margin(),
	)


	def audio_alignment_enabled() -> bool:
	default = "0" if adapter_id() else "1"
	return os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", default).strip() != "0"


	def gemma_generation_enabled() -> bool:
	return os.environ.get("LISPER_ZERO_GPU_USE_GEMMA_GENERATION", "0").strip() != "0"


	def torch_dtype() -> torch.dtype:
	requested = os.environ.get("LISPER_ZERO_GPU_DTYPE", "float16").strip().lower()
	if requested == "bfloat16":
	return torch.bfloat16
	if requested == "float32":
	return torch.float32
	return torch.float16


	def auth_token() -> str \| None:
	token = os.environ.get("HF_TOKEN", "").strip()
	return token or None


	def decode_browser_audio_payload(payload: str) -> np.ndarray:
	try:
	parsed = json.loads(payload)
	data_url = str(parsed.get("data_url") or "")
	if "," not in data_url:
	raise ValueError("Browser recorder payload is missing audio data.")
	_, encoded = data_url.split(",", 1)
	audio_bytes = base64.b64decode(encoded)
	waveform, sample_rate = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
	except Exception as exc:
	raise InvalidAudioError(
	f"Browser recording could not be decoded: {type(exc).__name__}: {exc}",
	{"status": "invalid_browser_audio_payload"},
	) from exc

	if waveform.ndim > 1:
	waveform = waveform.mean(axis=1)

	if sample_rate != 16000:
	waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)

	return waveform.astype(np.float32)


	def normalize_audio(audio_value: str \| tuple[int, np.ndarray] \| None) -> np.ndarray:
	if audio_value is None:
	raise gr.Error("Record or upload a short audio clip first.")

	if isinstance(audio_value, str):
	if audio_value.strip().startswith("{"):
	waveform = decode_browser_audio_payload(audio_value)
	sample_rate = 16000
	else:
	waveform, sample_rate = sf.read(audio_value, dtype="float32", always_2d=False)
	else:
	sample_rate, waveform = audio_value
	waveform = np.asarray(waveform, dtype=np.float32)

	if waveform.ndim > 1:
	waveform = waveform.mean(axis=1)

	if sample_rate != 16000:
	waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)

	# Keep ZeroGPU requests bounded.
	max_samples = 12 * 16000
	if waveform.shape[0] > max_samples:
	waveform = waveform[:max_samples]

	peak = float(np.max(np.abs(waveform))) if waveform.size else 0.0
	if peak > 1.0:
	waveform = waveform / peak

	return waveform.astype(np.float32)


	def audio_diagnostics(waveform: np.ndarray, sample_rate: int = 16000) -> dict[str, Any]:
	return compute_live_audio_diagnostics(waveform, sample_rate=sample_rate, policy=live_audio_policy())


	def validate_audio_for_analysis(waveform: np.ndarray) -> dict[str, Any]:
	diagnostics = audio_diagnostics(waveform)
	decision = validate_live_audio_diagnostics(diagnostics, live_audio_policy())
	if decision["status"] != "accepted":
	diagnostics["live_audio_gate"] = decision
	raise InvalidAudioError(str(decision["reason"]), diagnostics)
	return diagnostics


	def write_temp_audio(waveform: np.ndarray) -> str:
	fd, path = tempfile.mkstemp(prefix="lisper-zero-gpu-", suffix=".wav")
	os.close(fd)
	sf.write(path, waveform, 16000)
	return path


	def build_messages(target_text: str, audio_url: str, acoustic_result: dict[str, Any] \| None = None) -> list[dict[str, Any]]:
	instruction = DEFAULT_PROMPT
	if target_text.strip():
	instruction += f'\n\nTarget text: "{target_text.strip()}"'
	if acoustic_result:
	instruction += (
	"\n\nAcoustic pre-analysis from the waveform: "
	f"class={acoustic_result['detected_class']}, "
	f"confidence={acoustic_result['confidence']:.3f}. "
	"Use this exact class for the Detected class line. Do not override it."
	)

	return [
	{
	"role": "system",
	"content": [
	{
	"type": "text",
	"text": "You are Lisper, a supportive speech-therapy assistant focused on concise lisp coaching.",
	}
	],
	},
	{
	"role": "user",
	"content": [
	{"type": "audio", "url": audio_url},
	{"type": "text", "text": instruction},
	],
	},
	]


	def build_runtime() -> tuple[Any, Any]:
	repo_id = model_id()
	adapter_repo_id = adapter_id()
	token = auth_token()
	if adapter_repo_id:
	import unsloth # noqa: F401
	from unsloth import FastVisionModel

	kwargs = {
	"model_name": adapter_repo_id,
	"max_seq_length": max_seq_length(),
	"load_in_4bit": load_in_4bit_enabled(),
	"full_finetuning": False,
	}
	if token:
	kwargs["token"] = token
	model, processor = FastVisionModel.from_pretrained(**kwargs)
	FastVisionModel.for_inference(model)
	model.eval()
	return processor, model

	processor_source = adapter_repo_id or repo_id
	processor = AutoProcessor.from_pretrained(processor_source, token=token, trust_remote_code=True)
	model = Gemma4ForConditionalGeneration.from_pretrained(
	repo_id,
	token=token,
	torch_dtype=torch_dtype(),
	device_map={"": "cuda"},
	trust_remote_code=True,
	)
	model.eval()
	return processor, model


	RUNTIME: tuple[Any, Any] \| None = build_runtime() if eager_load_enabled() else None
	LAST_INPUT_SUMMARY: dict[str, Any] = {}


	def load_runtime() -> tuple[Any, Any]:
	global RUNTIME
	if RUNTIME is None:
	RUNTIME = build_runtime()
	return RUNTIME


	def strip_generation_artifacts(text: str) -> str:
	return text.replace("```", "").replace("<bos>", "").strip()


	def extract_line(label: str, text: str) -> str:
	match = re.search(rf"^{label}:\s*(.+)$", text, flags=re.IGNORECASE \| re.MULTILINE)
	return match.group(1).strip() if match else ""


	def normalize_label(text: str) -> str:
	value = text.strip().lower()
	if "inconclusive" in value or "unclear" in value:
	return "inconclusive"
	for candidate in ALLOWED_CLASSES:
	if candidate in value:
	return candidate
	return "inconclusive"


	def parse_response(response: str) -> dict[str, Any]:
	detected = normalize_label(extract_line("Detected class", response))
	return {
	"detected_class": detected,
	"reason": extract_line("Reason", response),
	"corrective_cue": extract_line("Corrective cue", response),
	"encouragement": extract_line("Encouragement", response),
	"raw_response": response,
	"model_id": model_id(),
	"adapter_id": adapter_id() or None,
	}


	def acoustic_normalize_audio(audio: np.ndarray) -> np.ndarray:
	audio = np.asarray(audio, dtype=np.float32).reshape(-1)
	if audio.size == 0:
	return audio
	audio = audio - float(np.mean(audio))
	peak = float(np.max(np.abs(audio)))
	if peak > 0:
	audio = audio * (0.98 / peak)
	return audio.astype(np.float32)


	def frame_audio(audio: np.ndarray, sr: int, frame_ms: float = 25.0, hop_ms: float = 10.0) -> np.ndarray:
	frame = max(1, int(sr * frame_ms / 1000))
	hop = max(1, int(sr * hop_ms / 1000))
	if len(audio) < frame:
	audio = np.pad(audio, (0, frame - len(audio)))
	count = 1 + (len(audio) - frame) // hop
	shape = (count, frame)
	strides = (audio.strides[0] * hop, audio.strides[0])
	return np.lib.stride_tricks.as_strided(audio, shape=shape, strides=strides).copy()


	def summarize_feature_values(values: np.ndarray) -> list[float]:
	values = np.asarray(values, dtype=np.float64)
	values = values[np.isfinite(values)]
	if values.size == 0:
	return [0.0] * 6
	return [
	float(np.mean(values)),
	float(np.std(values)),
	float(np.min(values)),
	float(np.max(values)),
	float(np.percentile(values, 10)),
	float(np.percentile(values, 90)),
	]


	def extract_acoustic_features(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
	if audio.size == 0:
	return np.zeros(88, dtype=np.float32)

	audio = acoustic_normalize_audio(audio)
	frames = frame_audio(audio, sr)
	window = np.hanning(frames.shape[1]).astype(np.float32)
	spectra = np.abs(np.fft.rfft(frames * window, axis=1)).astype(np.float64)
	freqs = np.fft.rfftfreq(frames.shape[1], d=1.0 / sr).astype(np.float64)
	power = spectra**2
	eps = 1e-10
	total = power.sum(axis=1) + eps

	centroid = (power * freqs).sum(axis=1) / total
	bandwidth = np.sqrt((power * (freqs[None, :] - centroid[:, None]) ** 2).sum(axis=1) / total)
	cumulative = np.cumsum(power, axis=1)
	rolloff_idx = np.argmax(cumulative >= 0.85 * total[:, None], axis=1)
	rolloff = freqs[rolloff_idx]
	flatness = np.exp(np.mean(np.log(power + eps), axis=1)) / (np.mean(power + eps, axis=1))
	zcr = np.mean(np.abs(np.diff(np.signbit(frames), axis=1)), axis=1)
	rms = np.sqrt(np.mean(frames**2, axis=1) + eps)
	entropy = -(power / total[:, None] * np.log((power / total[:, None]) + eps)).sum(axis=1) / math.log(
	power.shape[1]
	)

	def band_ratio(low: float, high: float) -> np.ndarray:
	mask = (freqs >= low) & (freqs < high)
	if not np.any(mask):
	return np.zeros(power.shape[0])
	return power[:, mask].sum(axis=1) / total

	bands = [
	band_ratio(0, 800),
	band_ratio(800, 1800),
	band_ratio(1800, 3200),
	band_ratio(3200, 5000),
	band_ratio(5000, min(7900, sr / 2)),
	band_ratio(3500, min(7500, sr / 2)),
	]
	deltas = np.diff(centroid, prepend=centroid[0])

	features: list[float] = [
	float(len(audio) / sr),
	float(np.mean(audio)),
	float(np.std(audio)),
	float(np.max(np.abs(audio))),
	]
	for values in [centroid, bandwidth, rolloff, flatness, zcr, rms, entropy, deltas, *bands]:
	features.extend(summarize_feature_values(values))
	return np.asarray(features, dtype=np.float32)


	ACOUSTIC_MODEL: dict[str, Any] \| None = None
	ACOUSTIC_EXTRATREES_MODEL: dict[str, Any] \| None = None


	def load_acoustic_model() -> dict[str, Any] \| None:
	global ACOUSTIC_MODEL
	if not acoustic_hint_enabled():
	return None
	if ACOUSTIC_MODEL is None:
	if not ACOUSTIC_MODEL_PATH.exists():
	return None
	ACOUSTIC_MODEL = json.loads(ACOUSTIC_MODEL_PATH.read_text(encoding="utf-8"))
	return ACOUSTIC_MODEL


	def load_acoustic_extratrees_model() -> dict[str, Any] \| None:
	global ACOUSTIC_EXTRATREES_MODEL
	if not acoustic_hint_enabled():
	return None
	if not ACOUSTIC_EXTRATREES_MODEL_PATH.exists():
	return None
	if ACOUSTIC_EXTRATREES_MODEL is None:
	import joblib

	ACOUSTIC_EXTRATREES_MODEL = joblib.load(ACOUSTIC_EXTRATREES_MODEL_PATH)
	return ACOUSTIC_EXTRATREES_MODEL


	def classify_acoustic_extratrees(waveform: np.ndarray) -> dict[str, Any] \| None:
	model = load_acoustic_extratrees_model()
	if model is None:
	return None

	features = extract_acoustic_features(waveform, sr=int(model.get("sample_rate", 16000))).reshape(1, -1)
	classifier = model["classifier"]
	prediction = str(classifier.predict(features)[0])
	confidence = 1.0
	class_scores: dict[str, float] = {}
	if hasattr(classifier, "predict_proba"):
	probabilities = classifier.predict_proba(features)[0]
	classes = [str(label) for label in classifier.classes_]
	class_scores = {
	label: round(float(probability), 6)
	for label, probability in sorted(zip(classes, probabilities), key=lambda item: item[1], reverse=True)
	}
	confidence = float(class_scores.get(prediction, 0.0))

	return {
	"detected_class": prediction,
	"raw_class": prediction,
	"confidence": confidence,
	"class_scores": class_scores,
	"model_name": model.get("name", "lisper_v18_extratrees_acoustic_hint"),
	"train_rows": model.get("train_rows"),
	"feature_count": model.get("feature_count"),
	"holdout_accuracy": model.get("holdout_accuracy"),
	"low_confidence_defaulted_to_clear": False,
	}


	def apply_live_clear_guard(result: dict[str, Any] \| None) -> dict[str, Any] \| None:
	if result is not None:
	result["live_clear_guard_applied"] = False
	return result


	def classify_acoustic_knn(waveform: np.ndarray) -> dict[str, Any] \| None:
	model = load_acoustic_model()
	if model is None:
	return None

	features = extract_acoustic_features(waveform, sr=int(model.get("sample_rate", 16000)))
	mean = np.asarray(model["mean"], dtype=np.float32)
	std = np.asarray(model["std"], dtype=np.float32)
	normalized = (features - mean) / np.where(std < 1e-6, 1.0, std)

	distances = []
	for exemplar in model["exemplars"]:
	exemplar_features = np.asarray(exemplar["features"], dtype=np.float32)
	distance = float(np.linalg.norm(normalized - exemplar_features))
	distances.append((distance, exemplar["label"], exemplar.get("source_id", "")))
	distances.sort(key=lambda item: item[0])

	class_scores: dict[str, float] = {label: 0.0 for label in model["classes"]}
	for distance, label, _source_id in distances[:ACOUSTIC_K]:
	class_scores[label] += 1.0 / max(distance, 1e-4)
	ranked = sorted(class_scores.items(), key=lambda item: item[1], reverse=True)
	top_label, top_score = ranked[0]
	total_score = sum(class_scores.values()) or 1.0
	confidence = float(top_score / total_score)
	detected_class = top_label if confidence >= ACOUSTIC_MIN_CONFIDENCE else "clear"

	return {
	"detected_class": detected_class,
	"raw_class": top_label,
	"confidence": confidence,
	"nearest_distance": round(distances[0][0], 4),
	"nearest_source_id": distances[0][2],
	"class_scores": {label: round(float(score), 6) for label, score in ranked},
	"model_name": model.get("name"),
	"low_confidence_defaulted_to_clear": detected_class == "clear" and top_label != "clear",
	}


	def _compact_acoustic_result(result: dict[str, Any] \| None) -> dict[str, Any] \| None:
	if result is None:
	return None
	keys = (
	"detected_class",
	"raw_class",
	"confidence",
	"nearest_distance",
	"nearest_source_id",
	"class_scores",
	"model_name",
	"low_confidence_defaulted_to_clear",
	)
	return {key: result[key] for key in keys if key in result}


	def maybe_apply_knn_override(
	extratrees_result: dict[str, Any] \| None,
	knn_result: dict[str, Any] \| None,
	) -> dict[str, Any] \| None:
	if extratrees_result is None or knn_result is None:
	return extratrees_result

	knn_label = normalize_label(str(knn_result.get("raw_class") or knn_result.get("detected_class") or ""))
	if knn_label == "clear":
	return extratrees_result

	confidence = float(knn_result.get("confidence") or 0.0)
	nearest_distance = float(knn_result.get("nearest_distance") or math.inf)
	max_distance = knn_override_max_distance()
	min_confidence = knn_override_min_confidence()
	if confidence < min_confidence or nearest_distance > max_distance:
	return {
	**extratrees_result,
	"hybrid_override_applied": False,
	"hybrid_override_reason": "knn_not_close_enough",
	"hybrid_override_thresholds": {
	"max_distance": max_distance,
	"min_confidence": min_confidence,
	},
	"knn_result": _compact_acoustic_result(knn_result),
	}

	class_scores = {
	label: float(score)
	for label, score in (knn_result.get("class_scores") or {}).items()
	if normalize_label(str(label)) in ALLOWED_CLASSES
	}
	return {
	**extratrees_result,
	"detected_class": knn_label,
	"raw_class": knn_label,
	"confidence": confidence,
	"class_scores": class_scores,
	"model_name": "lisper_hybrid_extratrees_knn_synthetic_override",
	"low_confidence_defaulted_to_clear": False,
	"hybrid_override_applied": True,
	"hybrid_override_reason": "knn_close_synthetic_exemplar",
	"hybrid_override_thresholds": {
	"max_distance": max_distance,
	"min_confidence": min_confidence,
	},
	"extratrees_result": _compact_acoustic_result(extratrees_result),
	"knn_result": _compact_acoustic_result(knn_result),
	}


	def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] \| None:
	preference = acoustic_model_preference()
	if preference == "extratrees":
	return classify_acoustic_extratrees(waveform)
	if preference == "knn":
	return classify_acoustic_knn(waveform)

	extratrees_result = classify_acoustic_extratrees(waveform)
	knn_result = classify_acoustic_knn(waveform)
	if extratrees_result is not None:
	return maybe_apply_knn_override(extratrees_result, knn_result)
	return knn_result


	def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] \| None) -> tuple[str, dict[str, Any]]:
	parsed = parse_response(response)
	if not acoustic_result:
	return response, parsed

	detected_class = normalize_label(str(acoustic_result["detected_class"]))
	if acoustic_result.get("live_clear_guard_applied"):
	template = GUARDED_CLASS_TEMPLATES.get(detected_class, CLASS_TEMPLATES[detected_class])
	else:
	template = CLASS_TEMPLATES[detected_class]
	encouragement = parsed.get("encouragement") or "Good effort. One focused repetition is enough for the next try."
	final_response = "\n".join(
	[
	f"Detected class: {detected_class}",
	f"Reason: {template['reason']}",
	f"Corrective cue: {template['cue']}",
	f"Encouragement: {encouragement}",
	]
	)
	final_parsed = parse_response(final_response)
	final_parsed["raw_model_response"] = response
	final_parsed["acoustic_hint_enforced"] = True
	return final_response, final_parsed


	def build_inconclusive_response(
	decision: dict[str, Any],
	acoustic_result: dict[str, Any] \| None,
	clip_diagnostics: dict[str, Any],
	) -> tuple[str, dict[str, Any]]:
	reason = str(decision.get("decision_reason") or "The clip was not reliable enough to classify.")
	if decision.get("status") == "error":
	response = "Analysis unavailable. The acoustic model is not loaded, so Lisper will not guess a class."
	else:
	response = "\n".join(
	[
	"Detected class: inconclusive",
	f"Reason: {reason}",
	"Corrective cue: Record one clear phrase with /s/ or /z/ sounds, close to the microphone, then try again.",
	"Encouragement: The clip was captured; we just need a cleaner attempt before giving a label.",
	]
	)
	parsed = parse_response(response)
	parsed["status"] = str(decision.get("status") or "inconclusive")
	parsed["raw_model_response"] = None
	parsed["acoustic_hint_enforced"] = False
	parsed["audio_diagnostics"] = clip_diagnostics
	parsed["acoustic_analysis"] = acoustic_result
	parsed["live_audio_gate"] = decision
	return response, parsed


	def build_detected_acoustic_response(
	acoustic_result: dict[str, Any],
	decision: dict[str, Any],
	clip_diagnostics: dict[str, Any],
	) -> tuple[str, dict[str, Any]]:
	detected_class = normalize_label(str(decision.get("detected_class") or acoustic_result.get("detected_class") or ""))
	if detected_class not in CLASS_TEMPLATES:
	detected_class = "inconclusive"
	if detected_class == "inconclusive":
	return build_inconclusive_response(
	{
	**decision,
	"status": "inconclusive",
	"decision_reason": "The live gate did not produce a valid class label.",
	},
	acoustic_result,
	clip_diagnostics,
	)

	template = CLASS_TEMPLATES[detected_class]
	response = "\n".join(
	[
	f"Detected class: {detected_class}",
	f"Reason: {template['reason']}",
	f"Corrective cue: {template['cue']}",
	"Encouragement: Nice work getting a usable recording. Try one focused repetition next.",
	]
	)
	parsed = parse_response(response)
	parsed["status"] = "detected"
	parsed["raw_model_response"] = None
	parsed["acoustic_hint_enforced"] = True
	parsed["gemma_generation_skipped"] = True
	parsed["audio_diagnostics"] = clip_diagnostics
	parsed["acoustic_analysis"] = acoustic_result
	parsed["live_audio_gate"] = decision
	return response, parsed


	def build_audio_only_inconclusive_decision(clip_diagnostics: dict[str, Any]) -> dict[str, Any] \| None:
	policy = live_audio_policy()
	if clip_diagnostics["sibilant_frame_ratio"] >= policy.min_sibilant_frame_ratio:
	return None
	return {
	"status": "inconclusive",
	"detected_class": "inconclusive",
	"candidate_class": None,
	"decision_reason": "The clip has speech energy, but not enough usable /s/ or /z/ airflow evidence.",
	"thresholds": {
	"min_audio_seconds": policy.min_audio_seconds,
	"min_peak": policy.min_peak,
	"min_rms": policy.min_rms,
	"min_voiced_ratio": policy.min_voiced_ratio,
	"min_speech_frame_ratio": policy.min_speech_frame_ratio,
	"min_tonal_frame_ratio": policy.min_tonal_frame_ratio,
	"min_sibilant_frame_ratio": policy.min_sibilant_frame_ratio,
	"max_noise_flatness": policy.max_noise_flatness,
	"max_clipping_ratio": policy.max_clipping_ratio,
	"clear_min_confidence": policy.clear_min_confidence,
	"clear_min_margin": policy.clear_min_margin,
	"nonclear_min_confidence": policy.nonclear_min_confidence,
	"nonclear_min_margin": policy.nonclear_min_margin,
	},
	"audio_diagnostics": clip_diagnostics,
	"classifier": {"available": False, "skipped": "insufficient_sibilant_evidence"},
	}


	def audio_token_id(processor: Any) -> int \| None:
	value = getattr(processor, "audio_token_id", None)
	if value is not None:
	return int(value)
	tokenizer = getattr(processor, "tokenizer", None)
	token = getattr(processor, "audio_token", None) or getattr(tokenizer, "audio_token", None)
	if tokenizer is not None and token is not None:
	return int(tokenizer.convert_tokens_to_ids(token))
	return None


	def replace_audio_token_run(input_ids: torch.Tensor, token_id: int, count: int) -> tuple[torch.Tensor, dict[str, int]]:
	positions = (input_ids == token_id).nonzero(as_tuple=False).flatten()
	if positions.numel() == 0:
	return input_ids, {"original_audio_tokens": 0, "aligned_audio_tokens": count}

	start = int(positions[0].item())
	end = start
	while end < input_ids.shape[0] and int(input_ids[end].item()) == token_id:
	end += 1

	replacement = torch.full((count,), token_id, dtype=input_ids.dtype, device=input_ids.device)
	aligned = torch.cat([input_ids[:start], replacement, input_ids[end:]], dim=0)
	return aligned, {"original_audio_tokens": end - start, "aligned_audio_tokens": count}


	def model_inference_dtype(model: Any) -> torch.dtype:
	dtype = getattr(model, "dtype", None)
	if dtype is not None:
	return dtype
	base_model = getattr(model, "base_model", None)
	dtype = getattr(base_model, "dtype", None)
	return dtype or torch_dtype()


	def module_parameter_dtype(module: Any) -> torch.dtype:
	try:
	return next(module.parameters()).dtype
	except StopIteration:
	return model_inference_dtype(module)


	def audio_input_dtype(model: Any) -> torch.dtype:
	requested = os.environ.get("LISPER_ZERO_GPU_AUDIO_DTYPE", "").strip().lower()
	if requested == "float16":
	return torch.float16
	if requested == "float32":
	return torch.float32
	if requested == "bfloat16" or adapter_id():
	return torch.bfloat16
	return module_parameter_dtype(audio_feature_module(model))


	def audio_feature_module(model: Any) -> Any:
	candidates = [
	model,
	getattr(model, "model", None),
	getattr(model, "base_model", None),
	getattr(getattr(model, "base_model", None), "model", None),
	getattr(getattr(getattr(model, "base_model", None), "model", None), "model", None),
	]
	for candidate in candidates:
	if candidate is not None and hasattr(candidate, "get_audio_features"):
	return candidate
	raise AttributeError("Could not locate Gemma audio feature module on loaded model.")


	def summarize_inputs(inputs: Any) -> dict[str, Any]:
	summary: dict[str, Any] = {}
	for key, value in dict(inputs).items():
	if hasattr(value, "shape") and hasattr(value, "dtype"):
	summary[key] = {
	"shape": [int(dim) for dim in value.shape],
	"dtype": str(value.dtype),
	"device": str(getattr(value, "device", "")),
	}
	else:
	summary[key] = {"type": type(value).__name__}
	return summary


	def align_audio_placeholders(inputs: Any, processor: Any, model: Any) -> tuple[Any, dict[str, int]]:
	if not audio_alignment_enabled():
	return inputs, {"audio_alignment_skipped": 1}

	if "input_features" not in inputs or "input_features_mask" not in inputs:
	return inputs, {}

	token_id = audio_token_id(processor)
	if token_id is None:
	return inputs, {}

	with torch.inference_mode():
	audio_output = audio_feature_module(model).get_audio_features(
	inputs["input_features"],
	inputs["input_features_mask"],
	return_dict=True,
	)

	encoded_count = int(audio_output.attention_mask.sum().item())
	if encoded_count <= 0 or inputs["input_ids"].shape[0] != 1:
	return inputs, {"encoded_audio_tokens": encoded_count}

	new_input_ids, metadata = replace_audio_token_run(inputs["input_ids"][0], token_id, encoded_count)
	metadata["encoded_audio_tokens"] = encoded_count
	if metadata["original_audio_tokens"] == encoded_count:
	return inputs, metadata

	inputs["input_ids"] = new_input_ids.unsqueeze(0)
	inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])

	if "mm_token_type_ids" in inputs and hasattr(processor, "create_mm_token_type_ids"):
	mm_token_type_ids = processor.create_mm_token_type_ids(inputs["input_ids"].detach().cpu())
	inputs["mm_token_type_ids"] = torch.as_tensor(
	mm_token_type_ids,
	dtype=inputs["input_ids"].dtype,
	device=inputs["input_ids"].device,
	)

	return inputs, metadata


	def _analyze_impl(audio: str \| tuple[int, np.ndarray] \| None, target_text: str) -> tuple[str, str]:
	global LAST_INPUT_SUMMARY

	waveform = normalize_audio(audio)
	clip_diagnostics = validate_audio_for_analysis(waveform)
	audio_only_decision = build_audio_only_inconclusive_decision(clip_diagnostics)
	if audio_only_decision is not None:
	response, parsed = build_inconclusive_response(audio_only_decision, None, clip_diagnostics)
	return response, json.dumps(parsed, indent=2)

	acoustic_result = classify_acoustic(waveform)
	live_decision = decide_live_analysis(acoustic_result, clip_diagnostics, live_audio_policy())
	if live_decision["status"] != "detected":
	response, parsed = build_inconclusive_response(live_decision, acoustic_result, clip_diagnostics)
	return response, json.dumps(parsed, indent=2)

	if not gemma_generation_enabled():
	response, parsed = build_detected_acoustic_response(acoustic_result, live_decision, clip_diagnostics)
	return response, json.dumps(parsed, indent=2)

	audio_url = write_temp_audio(waveform)
	processor, model = load_runtime()
	messages = build_messages(target_text, audio_url, acoustic_result)

	inputs = processor.apply_chat_template(
	messages,
	tokenize=True,
	add_generation_prompt=True,
	enable_thinking=False,
	return_dict=True,
	return_tensors="pt",
	)

	device = next(model.parameters()).device
	if hasattr(inputs, "to"):
	inputs = inputs.to(device)
	for float_key in ("input_features", "pixel_values", "pixel_values_videos"):
	if float_key in inputs and hasattr(inputs[float_key], "to"):
	inputs[float_key] = inputs[float_key].to(dtype=model_inference_dtype(model))
	if "input_features" in inputs:
	audio_dtype = audio_input_dtype(model)
	inputs["input_features"] = inputs["input_features"].to(dtype=audio_dtype)
	if "input_features_mask" in inputs and hasattr(inputs["input_features_mask"], "to"):
	inputs["input_features_mask"] = inputs["input_features_mask"].to(device=device)
	inputs, alignment = align_audio_placeholders(inputs, processor, model)
	LAST_INPUT_SUMMARY = summarize_inputs(inputs)

	with torch.inference_mode():
	outputs = model.generate(
	**inputs,
	max_new_tokens=max_new_tokens(),
	do_sample=False,
	use_cache=True,
	)

	prompt_length = inputs["input_ids"].shape[1]
	decoded = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
	raw_response = strip_generation_artifacts(decoded)
	response, parsed = enforce_acoustic_response(raw_response, acoustic_result)
	parsed["status"] = "detected"
	parsed["audio_token_alignment"] = alignment
	parsed["audio_diagnostics"] = clip_diagnostics
	parsed["acoustic_analysis"] = acoustic_result
	parsed["live_audio_gate"] = live_decision
	return response, json.dumps(parsed, indent=2)


	def analyze_with_errors(audio: str \| tuple[int, np.ndarray] \| None, target_text: str) -> tuple[str, str]:
	try:
	return _analyze_impl(audio, target_text)
	except InvalidAudioError as exc:
	payload = {
	"status": "rejected_audio",
	"reason": str(exc),
	"audio_diagnostics": exc.diagnostics,
	}
	return (
	"Recording not usable yet. Please record a clear speech clip before analysis.",
	json.dumps(payload, indent=2),
	)
	except Exception as exc:
	payload = {
	"error_type": type(exc).__name__,
	"message": str(exc),
	"model_id": model_id(),
	"adapter_id": adapter_id() or None,
	"dtype": os.environ.get("LISPER_ZERO_GPU_DTYPE", "float16"),
	"load_in_4bit": load_in_4bit_enabled(),
	"acoustic_hint_enabled": acoustic_hint_enabled(),
	"audio_alignment_enabled": audio_alignment_enabled(),
	"zero_gpu_size": zero_gpu_size(),
	"input_summary": LAST_INPUT_SUMMARY,
	"traceback": traceback.format_exc(limit=8),
	}
	return f"ZeroGPU inference failed: {type(exc).__name__}: {exc}", json.dumps(payload, indent=2)


	def analyze(audio: str \| tuple[int, np.ndarray] \| None, target_text: str) -> tuple[str, str]:
	return analyze_with_errors(audio, target_text)


	@spaces.GPU(duration=5, size=zero_gpu_size())
	def zero_gpu_healthcheck() -> str:
	return "ok"


	def analysis_started(browser_recording_payload: str, uploaded_audio: str \| tuple[int, np.ndarray] \| None) -> tuple[str, str]:
	if not browser_recording_payload.strip() and uploaded_audio is None:
	return (
	"No clip ready yet. Use the browser recorder or upload a short speech clip first.",
	json.dumps({"status": "waiting_for_audio"}, indent=2),
	)
	return (
	"Checking the recording quality and acoustic evidence...",
	json.dumps({"status": "running", "stage": "audio_preflight_then_acoustic_gate"}, indent=2),
	)


	def analyze_ui(
	browser_recording_payload: str,
	uploaded_audio: str \| tuple[int, np.ndarray] \| None,
	target_text: str,
	) -> tuple[str, str]:
	selected_audio: str \| tuple[int, np.ndarray] \| None = (
	browser_recording_payload.strip() if browser_recording_payload.strip() else uploaded_audio
	)
	if selected_audio is None:
	return (
	"No clip ready yet. Use the browser recorder or upload a short speech clip first.",
	json.dumps({"status": "waiting_for_audio"}, indent=2),
	)
	return analyze_with_errors(selected_audio, target_text)


	def build_app() -> gr.Blocks:
	with gr.Blocks(title="Lisper ZeroGPU", theme=gr.themes.Soft()) as demo:
	gr.Markdown(
	"""
	# Lisper ZeroGPU

	Server-side Gemma 4 audio analysis for users whose browser cannot comfortably run the WebGPU model.

	The currently validated fine-tuned Lisper model is Gemma 4 E2B. E4B and 31B are future model targets and should be deployed as separate revisions after training/eval.
	"""
	)
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Browser recorder")
	browser_recording_payload = gr.Textbox(
	label="Browser recorder payload",
	visible=False,
	elem_id="lisper-browser-recorder-payload",
	)
	browser_recording_status = gr.Markdown(
	"No browser recording ready. This path bypasses Gradio's microphone recorder."
	)
	browser_recording_playback = gr.HTML("")
	with gr.Row():
	browser_record = gr.Button("Record", variant="primary")
	browser_stop = gr.Button("Stop")
	browser_clear = gr.Button("Clear")
	gr.Markdown("### Upload fallback")
	audio = gr.Audio(
	sources=["upload"],
	type="filepath",
	label="Speech clip upload",
	editable=False,
	waveform_options=gr.WaveformOptions(show_recording_waveform=False),
	)
	audio_status = gr.Markdown(
	"No uploaded clip ready. Use the browser recorder above, or upload an audio file here."
	)
	target_text = gr.Textbox(
	label="Expected text",
	placeholder="Example: Sally sells seashells.",
	lines=2,
	)
	run = gr.Button("Analyze", variant="primary")
	with gr.Column(scale=1):
	output = gr.Textbox(label="Gemma response", lines=8)
	parsed = gr.Code(label="Parsed JSON", language="json")

	gr.Markdown(
	f"""
	Configured model: `{model_id()}`

	Configured adapter: `{adapter_id() or "none"}`

	Adapter 4-bit load: `{load_in_4bit_enabled()}`

	Acoustic hint: `{acoustic_hint_enabled()}`

	Audio token alignment: `{audio_alignment_enabled()}`

	ZeroGPU size: `{zero_gpu_size()}`

	If this Space errors on private or gated models, add `HF_TOKEN` as a Space secret. For local development without downloading the model, set `LISPER_ZERO_GPU_EAGER_LOAD=0`.
	"""
	)
	browser_record.click(
	None,
	inputs=[browser_recording_payload],
	outputs=[browser_recording_payload, browser_recording_status, browser_recording_playback],
	js=BROWSER_RECORDER_START_JS,
	queue=False,
	show_progress="hidden",
	)
	browser_stop.click(
	None,
	inputs=[],
	outputs=[browser_recording_payload, browser_recording_status, browser_recording_playback],
	js=BROWSER_RECORDER_STOP_JS,
	queue=False,
	show_progress="hidden",
	)
	browser_clear.click(
	None,
	inputs=[],
	outputs=[browser_recording_payload, browser_recording_status, browser_recording_playback],
	js=BROWSER_RECORDER_CLEAR_JS,
	queue=False,
	show_progress="hidden",
	)
	audio.change(
	lambda: "Uploaded clip ready. Analyze is available.",
	inputs=[],
	outputs=[audio_status],
	queue=False,
	show_progress="hidden",
	)
	audio.clear(
	lambda: "No uploaded clip ready. Use the browser recorder above, or upload an audio file here.",
	inputs=[],
	outputs=[audio_status],
	queue=False,
	show_progress="hidden",
	)
	run.click(
	analysis_started,
	inputs=[browser_recording_payload, audio],
	outputs=[output, parsed],
	queue=False,
	).then(
	analyze_ui,
	inputs=[browser_recording_payload, audio, target_text],
	outputs=[output, parsed],
	api_name="analyze",
	)
	return demo


	demo = build_app()

	if __name__ == "__main__":
	demo.queue(default_concurrency_limit=1).launch(show_error=True)