Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / frontend /src /lib /useStreamingVoice.ts

rohitsar567

fix(#55+#56+#53/#54-engine): TTS full natural readout + voice warm-stream/pre-roll

7d87d62 about 2 months ago

Raw

History Blame Contribute Delete

115 kB

	"use client";

	/**
	* useStreamingVoice — KI-168 (2026-05-15).
	*
	* Replaces the custom AudioWorklet + VAD + WAV-encode + /api/transcribe path
	* (useLiveConversation) with the browser's native Web Speech API. The user
	* sees their words land in the chat input area in real time as they speak,
	* just like ChatGPT / Claude voice mode — and when the browser detects
	* end-of-utterance silence, the final transcript is auto-submitted through
	* the existing send() path.
	*
	* Why this exists
	* -------------------------------------------------------------------------
	* The previous live-mode stack accumulated 12+ KIs of failure modes
	* (KI-044/057/060/064/113/114/115/131/134/139/141/159/165) trying to bolt
	* a reliable VAD onto raw mic PCM. Every fix surfaced a new failure on a
	* different mic / room / browser combo. The native SpeechRecognition API
	* gives us:
	* - browser-grade end-of-speech detection (no rmsThreshold tuning)
	* - streaming interim transcripts (no "where did my words go?" gap)
	* - in-browser STT (no /api/transcribe round-trip latency)
	*
	* Behaviour
	* -------------------------------------------------------------------------
	* - `enabled = true` → recognition.start() runs, mic icon stays live,
	* interim transcript streams into the chat input via onInterimTranscript.
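	* - Browser detects ~1.5s silence → onend fires → once the KI-202
	* utterance-batching grace window (UTTERANCE_GRACE_MS) passes with no
	* further speech, we hand the final transcript to onFinalTranscript
	* (caller calls send()).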
	* - After onend, if `enabled` is still true and no text request is in
	* flight, we restart recognition so the mic stays live (continuous-mode
	* emulation; native `continuous=true` doesn't fire silence-end on most
	* browsers, so we use continuous=false + auto-restart instead).
	* - `enabled = false` → recognition.abort() runs, no callbacks fire.
	*
	* Bot TTS playback is untouched — the page.tsx-owned <audio> elements still
	* play Sarvam-generated audio for assistant replies.
	*/

	import { useCallback, useEffect, useRef, useState } from "react";
	import { postTranscribe } from "./api";
	// KI-223..228 (2026-05-15) — additive resilience layer (V1.1/V1.3/V5.4/V6.8).
	// Lives in a sibling module so the hook body stays under control and the
	// retry / noise-floor / sample-rate helpers can be unit-tested in isolation.
	import {
	retryPostTranscribe,
	scaleSpeechZcrBand,
	AdaptiveNoiseFloor,
	type VoiceError as VoiceErrorBase,
	} from "./voice_resilience";

	// W1 (2026-05-15) — additive 4th voice-error code. Surfaces a silent
	// `getUserMedia` permission/denial failure (NotAllowedError /
	// NotFoundError / SecurityError / generic DOMException) so page.tsx can
	// render an actionable banner and revert the "Voice on" pill. Kept as a
	// local widening of the base `VoiceError` union from voice_resilience.ts
	// (which we don't touch per scope) — callers see the same
	// `onVoiceError(err: VoiceError) => void` shape, just with one more legal
	// string value.
	/**
	 * Structured error codes delivered through `onVoiceError`: every code of
	 * the base union from `./voice_resilience`, plus `"mic_permission_denied"`.
	 */
	export type VoiceError = VoiceErrorBase | "mic_permission_denied";

	// KI-189 (2026-05-15) — live-speak barge-in tuning constants.
	// The MediaRecorder mic stream IS echo-cancelled by the browser (KI-185
	// `getUserMedia` AEC constraints), so the bot's TTS bleed lands at a
	// very low RMS (~0.001-0.005) while actual user speech sits at ~0.05-0.2.
	// We pick a threshold in between, and require sustained energy over
	// several consecutive frames to avoid firing on coughs / room thumps /
	// single-frame spikes. (KI-189 originally required ~300ms = 18 frames;
	// the KI-212 change below shortened that to the current ~100ms.)
	// KI-212 (2026-05-15) — was 0.025 / 18 frames. User reported barge-in
	// completely failing: bot reads entire 14s reply uninterrupted. Lowered
	// to fire on ANY decent speech burst within 100ms. Risk: false positives
	// (chair creak, cough) — acceptable trade vs. broken barge-in.
	// Static mic-RMS floor for the barge-in detector (current KI-212 value).
	const BARGE_IN_RMS_THRESHOLD = 0.008;
	// Consecutive above-threshold frames required before barge-in fires.
	const BARGE_IN_SUSTAINED_FRAMES = 6; // ~100ms @ 60fps rAF
	// KI-190 (2026-05-15) — adaptive threshold. The MediaRecorder mic stream
	// has AEC, but for very loud bot TTS the residual bleed can still cross
	// a static threshold (0.025 when KI-190 was written; the static value is
	// now BARGE_IN_RMS_THRESHOLD). We instead compute the threshold dynamically
	// from the bot's CURRENT audio level: bot_rms * MULTIPLIER + BASE, i.e.
	//   bot_rms * BARGE_IN_BOT_RMS_MULTIPLIER + BARGE_IN_BASE_THRESHOLD.
	// Bot loud → threshold rises so user must speak loudly to overcome residual;
	// bot quiet → threshold drops near floor so soft speech still wins.
	// KI-212 — multiplier lowered 2.0 → 1.5 + base 0.005 → 0.002. Together
	// with the static threshold drop, makes barge-in fire on much softer
	// user speech even when bot is loud.
	// Scale applied to the bot's measured RMS (current KI-212 value).
	const BARGE_IN_BOT_RMS_MULTIPLIER = 1.5;
	// Additive offset of the adaptive threshold (current KI-212 value).
	const BARGE_IN_BASE_THRESHOLD = 0.002;
	// KI-191 (2026-05-15) — duck bot TTS volume while voice mode is on.
	// Reducing playback amplitude further widens the gap between the bot's
	// residual mic bleed (after AEC) and the user's normal-volume speech,
	// making barge-in trivial. KI-191 originally shipped 0.6, judged loud
	// enough to hear clearly on headphones and laptop speakers without
	// overpowering user speech; that value has since been superseded.
	// KI-211 (2026-05-15) — was 0.6; lowered to 0.3 because first-turn barge-in
	// fails when adaptive calibration (KI-195) hasn't sampled user_speech_rms yet.
	// 0.3 is loud enough to hear clearly on speakers + mic bleed is well under
	// the static BARGE_IN_RMS_THRESHOLD, so users can talk over the bot on the
	// first turn without needing prior calibration.
	// NOTE(review): presumably applied as the <audio> element's `volume`
	// (0..1 scale) — confirm at the use site, which is outside this view.
	const VOICE_MODE_TTS_VOLUME = 0.3;
	// KI-195 (2026-05-15) — adaptive TTS volume calibration relative to user's
	// own measured speech level. Architecture: while user speaks (recorder
	// active, NOT TTS) we sample mic RMS and track a rolling peak in
	// userSpeechRmsRef. While TTS plays, every 300ms we sample bot_rms_at_mic
	// via the KI-190 botAnalysers and reduce el.volume by 20% if bot_rms is
	// closer to user_rms than the target ratio. Floor at 0.15 so the bot
	// stays audible. This makes "bot bleed < user speech" a mathematical
	// guarantee after one calibration turn → barge-in always works, echo
	// never crosses the recognition threshold.
	const USER_SPEECH_RMS_INITIAL = 0.05; // typical quiet speech, used until calibrated
	const USER_SPEECH_DETECTION_THRESHOLD = 0.02; // mic RMS above this counts as "user speaking"
	// FIX 5 (HIGH) — hard ceiling on the rolling-peak userSpeechRms. Without
	// this, a single shout pins userSpeechRms at 0.4+ for the entire session
	// → adaptive barge-in threshold rises → normal-volume speech can't break
	// through → user has to shout to barge in again. The userRmsTick is also
	// gated on !isTtsPlaying, so during TTS playback there's NO decay path —
	// the wall-clock decay interval below provides decay regardless of gating.
	const USER_SPEECH_RMS_CEILING = 0.15; // hard cap on the rolling-peak userSpeechRms
	const USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS = 1000; // wall-clock decay interval (ms)
	// NOTE(review): presumably userSpeechRms is multiplied by this factor once
	// per decay interval — confirm at the use site, which is outside this view.
	const USER_SPEECH_RMS_WALL_CLOCK_DECAY_FACTOR = 0.9;
	const VOLUME_CALIB_TARGET_RATIO = 0.35; // bot_rms_at_mic should be ≤ user_rms × this
	const VOLUME_CALIB_TICK_MS = 300; // calibration sample period during TTS
	const VOLUME_CALIB_DUCK_FACTOR = 0.8; // multiply el.volume by this per tick if too loud
	const VOLUME_CALIB_FLOOR = 0.15; // never drop bot below this — must stay audible

	// KI-202 (2026-05-15) — utterance batching grace window.
	// Web Speech API's `onend` fires after ~1.5s silence, which means a natural
	// mid-sentence pause ("So it will be just [pause] me") triggers TWO separate
	// onend events and the user's sentence is submitted in two halves. We delay
	// the actual submission by UTTERANCE_GRACE_MS after onend; if recognition
	// re-fires (next word burst) before the timer expires, we append the new
	// text/audio chunks and reset the timer. Only after a full UTTERANCE_GRACE_MS
	// of true silence do we submit.
	// NOTE(review): added to the browser's own ~1.5s end-of-speech silence, this
	// presumably puts submission roughly 3s after the user stops talking —
	// confirm that end-to-end latency is intended.
	const UTTERANCE_GRACE_MS = 1500; // ms to wait after onend before submitting
	// KI-203 (2026-05-15) — post-TTS result-drop window.
	// `recognition.abort()` doesn't immediately stop result delivery — onresult
	// events from the now-abandoned recognition can keep arriving for a beat
	// afterwards. Keep dropping results for this many ms after TTS ends.
	const POST_TTS_DROP_MS = 300; // ms

	// KI-285 (2026-05-16) — echo-suppression barge-in grace window.
	//
	// ROOT CAUSE this fixes: the bot's TTS reply was stopping a fraction of a
	// second after it started, with NO user having spoken. The reply audio is a
	// single <audio> blob (no chunking, `ended` fires once at true end), so the
	// premature stop could only come from triggerBargeIn() pausing the element.
	// The barge-in VAD floors its threshold at BARGE_IN_RMS_THRESHOLD (0.008)
	// when computeBotRms() returns 0 — which it ALWAYS does for the first frames
	// of playback (the per-element MediaElementSource analyser has no data yet)
	// and PERMANENTLY whenever createMediaElementSource() throws (Safari,
	// element already Web-Audio-routed, or autoplay-suspended ctx). Browser AEC
	// is imperfect on speaker (non-headphone) users; the bot's own voice echoes
	// back into the mic at ~0.001-0.02 RMS in the speech ZCR band — clearing the
	// 0.008 floor for 6 frames (~100ms) and self-triggering a "barge-in" on the
	// bot's OWN audio. No prior hysteresis guarded the playback-start window.
	//
	// FIX: do not treat ANY VAD energy as a barge-in until the bot's audio has
	// been playing for BARGE_IN_GRACE_MS. The first ~600ms of a reply is where
	// echo (not the user) is the energy source — the user has not yet had time
	// to hear enough of the reply to decide to interrupt, let alone produce
	// BARGE_IN_SUSTAINED_FRAMES of speech. Genuine barge-in is unaffected: a
	// real interruption is the user speaking over the bot for seconds, so the
	// sustained-energy gate is re-armed and fires the instant the grace window
	// elapses while the user is still talking. Only the bot's own start-of-reply
	// echo — which by definition cannot outlast a brief grace window without the
	// user actually speaking — is suppressed.
	const BARGE_IN_GRACE_MS = 600; // ms since TTS playback start during which barge-in is suppressed
	// KI-285 (2026-05-16) — defence-in-depth. Even AFTER the grace window, when
	// computeBotRms() is unavailable (returns 0) we must not collapse the
	// barge-in threshold to the bare 0.008 static floor — that floor is BELOW
	// documented speaker echo bleed (up to ~0.02 RMS per KI-189/190 comments),
	// so echo alone clears it. When we have no usable bot-level reference, hold
	// the threshold at this echo-safe floor. Real user speech sits at
	// ~0.05-0.2 RMS (KI-189) and clears this comfortably; residual AEC echo
	// (~0.02 worst case on speakers) does not.
	// NOTE(review): the KI-189 comment in this file cites ~0.001-0.005 RMS for
	// echo-cancelled bleed, not ~0.02; the ~0.02 worst case appears only in
	// these KI-285 notes. Confirm which measurement is current. Also note that
	// 0.035 sits above USER_SPEECH_DETECTION_THRESHOLD (0.02), so speech in the
	// 0.02-0.035 band counts as "user speaking" yet cannot barge in on this path.
	const BARGE_IN_NO_BOTREF_FLOOR = 0.035;

	// =========================================================================
	// #53 / #54 (2026-05-18) — push-to-talk head-clipping + start-latency fix.
	//
	// ROOT CAUSE (verified):
	// page.tsx's push-to-talk path cold-starts the mic on every SPACE press:
	// page.tsx:1350-1361 onKeyDown(SPACE) → startRecordingRef.current()
	// page.tsx:1004-1019 startRecording() → navigator.mediaDevices
	// .getUserMedia(...) [COLD — 200-700ms on HF Space]
	// page.tsx:1021 new MediaRecorder(stream)
	// page.tsx:1213 recorder.start() [capture truly begins HERE]
	// Every word the user speaks between the keydown and recorder.start()
	// firing is never captured → the leading word is lost/garbled (#53,
	// transcribed "S A R" for "Sir."). The same cold-start is the multi-second
	// delay the user feels before recording begins (#54). There is NO pre-roll
	// buffer and NO warm/pre-armed stream anywhere in the codebase.
	//
	// FIX (this hook, since page.tsx is owned by another writer and its PTT path
	// is fully self-contained):
	// - Keep ONE mic stream + MediaRecorder + AudioContext WARM for the hook's
	// entire armed lifetime (acquired once after the user opts into voice,
	// never torn down per-press, survives the Live↔PTT toggle). A persistent
	// open audio device means the OS mic is already hot, so page.tsx's own
	// per-press getUserMedia resolves in ~10-50ms instead of cold-starting
	// (200-700ms) — that alone removes the felt multi-second start delay.
	// - The warm MediaRecorder runs with a short timeslice, feeding a rolling
	// PRE-ROLL ring buffer that always holds the last ~PRE_ROLL_MS of audio.
	// - The PTT API (beginPushToTalk/endPushToTalk) prepends the pre-roll to
	// the captured utterance, so the FIRST WORD — spoken in the cold-start
	// gap — is always in the blob even though page.tsx's recorder missed it.
	// - A DELIBERATE-HOLD gate: beginPushToTalk arms instantly but the capture
	// only "engages" after HOLD_THRESHOLD_MS; a sub-threshold tap (key
	// bounce, accidental press) is discarded and produces no submission.
	// - AudioContext.resume() is kept warm WHILE armed (not lazily on first
	// press), and warm-stream / permission / worklet failures are surfaced
	// via onVoiceError — never silent.
	//
	// The pure pre-roll ring-buffer + hold-gate logic is exported (PreRollRing,
	// evaluateHoldGate) so it is self-contained and independently exercised by
	// the regression test.
	// =========================================================================

	// Size of the rolling pre-roll buffer. Must comfortably cover the worst-case
	// page.tsx cold-start gap (getUserMedia 200-700ms + MediaRecorder spin-up +
	// the optional 400ms Live-teardown wait at page.tsx:994). 800ms gives margin
	// without bloating the blob (browser webm/opus ≈ 4 KB/s ⇒ ~3.2 KB of lead-in).
	// NOTE(review): the worst case listed above (700ms + spin-up + 400ms) adds
	// up to more than 800ms, so the window covers the typical gap rather than
	// the stated worst case — confirm 800ms is still the intended size.
	export const PRE_ROLL_MS = 800;
	// Warm MediaRecorder timeslice. Small enough that the pre-roll ring has fine
	// granularity (we never drop more than one slice of lead-in when trimming the
	// ring to PRE_ROLL_MS), large enough not to thrash ondataavailable.
	// PRE_ROLL_MS is an exact multiple of this value (4 slices).
	export const WARM_TIMESLICE_MS = 200;
	// Deliberate-hold threshold (#54). The hold must be intentional so an
	// accidental tap / key-bounce doesn't fire a turn, but it must feel instant
	// on a real hold — 200ms sits in the requested 150-250ms band.
	export const HOLD_THRESHOLD_MS = 200;

	/**
	 * PreRollRing — a rolling, time-bounded buffer of MediaRecorder Blob slices.
	 *
	 * `push` appends a freshly emitted slice together with the duration it is
	 * assumed to represent (default WARM_TIMESLICE_MS). After each push the
	 * oldest slices are evicted for as long as the remaining slices still add
	 * up to at least `windowMs`, so once enough audio has been pushed the ring
	 * holds at least `windowMs` of it, and at most one slice more. A word that
	 * started just before the window boundary is therefore never trimmed.
	 *
	 * Durations are bookkeeping values supplied by the caller, not measured
	 * from the blobs. A non-finite, zero or negative `sliceMs` is replaced with
	 * WARM_TIMESLICE_MS: a NaN would otherwise turn `retainedMs` into NaN and
	 * silently disable eviction, so the ring would grow without bound.
	 *
	 * Pure and framework-free so the regression test can drive it without a
	 * browser.
	 *
	 * NOTE(review): MediaRecorder timeslice chunks are generally not
	 * independently decodable — the container header is normally only in the
	 * first chunk. Confirm that a blob assembled from slices left after
	 * eviction still decodes on the transcription side.
	 */
	export class PreRollRing {
	  /** Retained slices, oldest first, each with its assumed duration in ms. */
	  private slices: Array<{ blob: Blob; ms: number }> = [];
	  /** Sum of `ms` over `slices`. */
	  private retainedMs = 0;
	  /** Target lead-in duration in ms. */
	  private readonly windowMs: number;

	  /**
	   * @param windowMs Lead-in duration to retain, in ms. NaN falls back to
	   *   PRE_ROLL_MS; negative values behave like 0 (only the newest slice is
	   *   kept); Infinity disables eviction.
	   */
	  constructor(windowMs: number = PRE_ROLL_MS) {
	    // A NaN window makes every eviction comparison false, so nothing would
	    // ever be evicted.
	    this.windowMs = Number.isNaN(windowMs) ? PRE_ROLL_MS : Math.max(0, windowMs);
	  }

	  /**
	   * Append one recorder slice and trim the ring back to the window.
	   *
	   * @param blob Slice emitted by the recorder; empty or missing blobs are ignored.
	   * @param sliceMs Duration the slice represents, in ms. Invalid values
	   *   (NaN, ±Infinity, <= 0) are replaced with WARM_TIMESLICE_MS.
	   */
	  push(blob: Blob, sliceMs: number = WARM_TIMESLICE_MS): void {
	    if (!blob || blob.size <= 0) return;
	    const ms = Number.isFinite(sliceMs) && sliceMs > 0 ? sliceMs : WARM_TIMESLICE_MS;
	    this.slices.push({ blob, ms });
	    this.retainedMs += ms;
	    // Evict from the front only while the remainder still covers windowMs,
	    // and never drop the last slice.
	    while (this.slices.length > 1) {
	      const oldest = this.slices[0];
	      if (oldest === undefined || this.retainedMs - oldest.ms < this.windowMs) break;
	      this.slices.shift();
	      this.retainedMs -= oldest.ms;
	    }
	  }

	  /** Retained lead-in slices oldest-first; clears the ring. */
	  drain(): Blob[] {
	    const out = this.slices.map((s) => s.blob);
	    this.clear();
	    return out;
	  }

	  /** Approximate retained duration (ms), i.e. the sum of the pushed slice durations. */
	  retainedDurationMs(): number {
	    return this.retainedMs;
	  }

	  /** Drop every retained slice and reset the duration counter. */
	  clear(): void {
	    this.slices = [];
	    this.retainedMs = 0;
	  }
	}

	/**
	 * evaluateHoldGate — classifies a push-to-talk press as a deliberate hold
	 * or a sub-threshold tap (#54).
	 *
	 * Pure function of its arguments, so the threshold boundary can be asserted
	 * exactly without timers.
	 *
	 * @param pressedAt Timestamp (ms) at which the press began.
	 * @param releasedAt Timestamp (ms) at which the press ended.
	 * @param thresholdMs Minimum hold duration that counts as deliberate.
	 * @returns `heldMs` — the hold duration, clamped so it is never negative —
	 *   and `deliberate`, true exactly when `heldMs >= thresholdMs` (engage and
	 *   submit); false means the press is discarded with no submission.
	 */
	export function evaluateHoldGate(
	  pressedAt: number,
	  releasedAt: number,
	  thresholdMs: number = HOLD_THRESHOLD_MS,
	): { deliberate: boolean; heldMs: number } {
	  const elapsed = releasedAt - pressedAt;
	  // A release stamped before the press is treated as a zero-length hold.
	  const heldMs = Math.max(0, elapsed);
	  const deliberate = heldMs >= thresholdMs;
	  return { deliberate, heldMs };
	}

	// Minimal types for the Web Speech API since lib.dom.d.ts ships them under
	// `webkitSpeechRecognition` only and the standard `SpeechRecognition` symbol
	// is still vendor-prefixed in most browsers as of 2026-05.
	// Only the members this module declares are modelled; these are structural
	// stand-ins, not the full browser interfaces.

	/** One recognition hypothesis for a stretch of speech. */
	type SpeechRecognitionAlternative = { transcript: string; confidence: number };

	/** Array-like list of alternatives for one result; `isFinal` marks a settled result. */
	type SpeechRecognitionResult = {
	  isFinal: boolean;
	  length: number;
	  [index: number]: SpeechRecognitionAlternative;
	};

	/** Array-like list of results carried by a recognition event. */
	type SpeechRecognitionResultList = {
	  length: number;
	  [index: number]: SpeechRecognitionResult;
	};

	/** Shape of the event passed to `onresult`. */
	interface SpeechRecognitionEventLike extends Event {
	  resultIndex: number;
	  results: SpeechRecognitionResultList;
	}

	/** Shape of the event passed to `onerror`. */
	interface SpeechRecognitionErrorEventLike extends Event {
	  error: string;
	  message?: string;
	}

	/** The subset of a SpeechRecognition object this module declares. */
	interface SpeechRecognitionInstance extends EventTarget {
	  lang: string;
	  continuous: boolean;
	  interimResults: boolean;
	  maxAlternatives: number;
	  start: () => void;
	  stop: () => void;
	  abort: () => void;
	  // Handler slots are nullable so a handler can be detached by assigning null.
	  onresult: ((ev: SpeechRecognitionEventLike) => void) | null;
	  onerror: ((ev: SpeechRecognitionErrorEventLike) => void) | null;
	  onend: ((ev: Event) => void) | null;
	  onstart: ((ev: Event) => void) | null;
	}

	/** Zero-argument constructor producing a recognition instance. */
	type SpeechRecognitionCtor = new () => SpeechRecognitionInstance;

	/**
	* Options accepted by `useStreamingVoice`.
	*/
	export interface UseStreamingVoiceOptions {
	// Master switch. Per the file header: true → recognition.start() runs and
	// the mic stays live; false → recognition.abort() runs and no callbacks fire.
	enabled: boolean;
	// Receives the interim transcript as the user speaks (per the file header,
	// streamed into the chat input).
	onInterimTranscript: (text: string) => void;
	// Receives the final transcript of an utterance; per the file header the
	// caller is expected to call send() with it.
	onFinalTranscript: (text: string) => void;
	// Error callback carrying a message string.
	onError: (msg: string) => void;
	// Called with the current listening state.
	// NOTE(review): the exact trigger points are outside this view.
	onListening: (listening: boolean) => void;
	// Ref whose `.current` is true while a text request is in flight. The hook's
	// waitForTextClear helper polls it every 300ms and waits (30s default cap)
	// for it to clear before proceeding.
	isTextRequestPendingRef: React.MutableRefObject<boolean>;
	// Recognition language; the hook defaults it to "en-IN" when omitted.
	// NOTE(review): presumably assigned to recognition.lang — confirm.
	language?: string;
	// KI-223 (2026-05-15) — V1.1 / V1.2 / V5.4. Optional structured error
	// callback so page.tsx can react specifically to recoverable failures
	// (e.g. show "tap to enable audio" when audio_context_suspended fires).
	// Optional: existing consumers that don't pass this still work.
	onVoiceError?: (err: VoiceError) => void;
	}

	/**
	 * Handle returned by `useStreamingVoice`.
	 */
	export interface UseStreamingVoiceReturn {
	  /**
	   * Imperative start.
	   * NOTE(review): implementation is outside this view — presumably the
	   * user-toggled counterpart of `enabled = true`; confirm.
	   */
	  start: () => void;
	  /**
	   * Imperative stop.
	   * NOTE(review): implementation is outside this view — presumably the
	   * user-toggled counterpart of `enabled = false`; confirm.
	   */
	  stop: () => void;
	  /**
	   * True when a SpeechRecognition constructor (standard or webkit-prefixed)
	   * was found on `window` when the hook first initialised its state.
	   */
	  isSupported: boolean;
	  /**
	   * FIX 3 (HIGH) — Barge-in signal. The hook flips an internal flag when
	   * `triggerBargeIn` fires (user spoke over bot TTS). The caller (page.tsx)
	   * should poll this method before/after every fetch tick during a /api/chat
	   * stream — if it returns true, abort the in-flight request and any pending
	   * audio assembly so the bot doesn't keep talking after the user
	   * interrupted. Reading clears the flag (one-shot semantics).
	   *
	   * Wire-up (caller side, OUT OF THIS HOOK'S SCOPE):
	   * - Before fetch, store an AbortController locally.
	   * - In the stream-reading loop, periodically check
	   *   `streamingVoice.consumeBargeInSignal()` and call `controller.abort()`
	   *   when it returns true.
	   * - Alternatively register a side-effect that polls every 100ms while a
	   *   send() is in flight.
	   */
	  consumeBargeInSignal: () => boolean;

	  // ----------------------------------------------------------------------
	  // #53 / #54 — warm-stream + pre-roll push-to-talk API.
	  //
	  // This is the minimal API the push-to-talk UI integrates with. Even
	  // without an explicit call, `armWarmStream()` is invoked autonomously by
	  // the hook once voice has been enabled, so the OS mic device is kept hot
	  // for the rest of the session — that removes the per-press cold-start that
	  // page.tsx's own getUserMedia otherwise pays (the felt multi-second delay,
	  // #54) and continuously fills the pre-roll ring so the leading word spoken
	  // in the cold-start gap survives (#53).
	  // ----------------------------------------------------------------------

	  /** True once the warm mic stream + recorder + AudioContext are live and
	   * the pre-roll ring is filling. */
	  isWarm: boolean;

	  /** Pre-arm (or re-arm) the persistent warm stream. Idempotent; safe to
	   * call repeatedly. Resolves true when the warm stream is recording. */
	  armWarmStream: () => Promise<boolean>;

	  /** Release the warm stream + recorder + AudioContext (mic indicator off).
	   * Called on unmount; callers may call it to fully relinquish the mic. */
	  disarmWarmStream: () => void;

	  /**
	   * Engage a push-to-talk capture. Call on hold-start (e.g. SPACE keydown).
	   * Returns immediately. The capture engages only after HOLD_THRESHOLD_MS
	   * so a sub-threshold tap is ignored; the engaged utterance is seeded with
	   * the pre-roll ring so the first word (spoken during the cold-start gap)
	   * is always included.
	   */
	  beginPushToTalk: () => void;

	  /**
	   * End a push-to-talk capture. Call on hold-release (e.g. SPACE keyup).
	   * If the hold was deliberate (>= HOLD_THRESHOLD_MS) the assembled blob
	   * (pre-roll + live capture) is transcribed and delivered via
	   * onFinalTranscript; a sub-threshold tap resolves to null and submits
	   * nothing. Resolves with the final transcript, or null when discarded /
	   * empty.
	   */
	  endPushToTalk: () => Promise<string | null>;

	  /** Snapshot+drain the current pre-roll ring (oldest-first). Exposed for
	   * the regression test and any caller that wants to splice the lead-in
	   * into its own recorder blob. */
	  consumePreRollChunks: () => Blob[];
	}

	/**
	 * Locate the browser's SpeechRecognition constructor.
	 *
	 * @returns The standard `window.SpeechRecognition` when present, otherwise
	 *   the vendor-prefixed `window.webkitSpeechRecognition`, otherwise null.
	 *   Also returns null when `window` is undefined (e.g. server-side render).
	 */
	function resolveCtor(): SpeechRecognitionCtor | null {
	  // Guard for environments with no `window` global.
	  if (typeof window === "undefined") {
	    return null;
	  }
	  // Neither property is assumed present on `window`, so read both through
	  // a narrow structural view.
	  const host = window as unknown as {
	    SpeechRecognition?: SpeechRecognitionCtor;
	    webkitSpeechRecognition?: SpeechRecognitionCtor;
	  };
	  // The standard name wins over the prefixed one.
	  const standard = host.SpeechRecognition;
	  if (standard != null) {
	    return standard;
	  }
	  return host.webkitSpeechRecognition ?? null;
	}

	export function useStreamingVoice(
	opts: UseStreamingVoiceOptions,
	): UseStreamingVoiceReturn {
	const {
	enabled,
	onInterimTranscript,
	onFinalTranscript,
	onError,
	onListening,
	isTextRequestPendingRef,
	language = "en-IN",
	onVoiceError,
	} = opts;

	// Keep latest callback refs so the recognition handlers always call the
	// freshest closure without re-binding the recognition instance on every
	// render (re-binding mid-utterance loses interim results).
	const onInterimRef = useRef(onInterimTranscript);
	const onFinalRef = useRef(onFinalTranscript);
	const onErrorRef = useRef(onError);
	const onListeningRef = useRef(onListening);
	// KI-223 — optional structured-error callback ref. Defaults to no-op so
	// the rest of the hook can call it unconditionally without null checks.
	const onVoiceErrorRef = useRef<(err: VoiceError) => void>(
	onVoiceError ?? (() => { /* no-op */ }),
	);
	useEffect(() => { onInterimRef.current = onInterimTranscript; }, [onInterimTranscript]);
	useEffect(() => { onFinalRef.current = onFinalTranscript; }, [onFinalTranscript]);
	useEffect(() => { onErrorRef.current = onError; }, [onError]);
	useEffect(() => { onListeningRef.current = onListening; }, [onListening]);
	useEffect(() => { onVoiceErrorRef.current = onVoiceError ?? (() => { /* no-op */ }); }, [onVoiceError]);

	const recognitionRef = useRef<SpeechRecognitionInstance \| null>(null);
	const finalsRef = useRef<string[]>([]);
	// KI-217 (2026-05-15) — track how many entries of finalsRef have already
	// been drained to pendingUtteranceRef. Each onend reads the slice from
	// `finalsConsumedRef.current` to end, then bumps the cursor. finalsRef
	// itself is NOT reset between restart cycles — only after the grace-timer
	// submit (when onFinalRef fires) or on user-toggled start/stop. This
	// prevents a Chrome quirk where late-delivered isFinal results arriving
	// after onend on a mid-utterance restart cycle would land in a freshly
	// wiped finalsRef and get dropped on the NEXT onend cycle's drain.
	const finalsConsumedRef = useRef<number>(0);
	const wantRunningRef = useRef(false); // mirrors `enabled` for handler closures
	const restartTimerRef = useRef<ReturnType<typeof setTimeout> \| null>(null);
	const errorBackoffRef = useRef(0);
	// KI-188 (2026-05-15) — TTS-playback gate. Web Speech API has its own
	// internal mic pipeline that bypasses our getUserMedia AEC constraints,
	// so SpeechRecognition transcribes the bot's TTS audio bleeding from
	// speakers as user input ("echo loop"). The only reliable fix from JS
	// is to abort recognition while ANY <audio> in the DOM is playing.
	// Tracked via a MutationObserver + per-element play/pause/ended hooks.
	const isTtsPlayingRef = useRef(false);
	// KI-285 (2026-05-16) — wall-clock timestamp of the moment the CURRENT
	// bot TTS playback began (the false→true edge in updateTtsState). The
	// barge-in tick refuses to trigger until BARGE_IN_GRACE_MS has elapsed
	// since this instant, so the bot's own start-of-reply echo cannot
	// self-trigger a barge-in. Reset to 0 whenever TTS is not playing.
	const ttsPlaybackStartedAtRef = useRef<number>(0);
	const ttsAudioElementsRef = useRef<Set<HTMLAudioElement>>(new Set());
	// KI-203 (2026-05-15) — silently discard SpeechRecognition.onresult events
	// while this flag is true. Flipped on the instant TTS playback starts
	// (closes the ~100-300ms window between `audio.play()` and our abort()
	// taking effect, during which bot voice was being transcribed as user
	// input). Flipped back ~POST_TTS_DROP_MS after TTS ends so any in-flight
	// results from the dying recognition pipeline are still suppressed.
	const dropResultsRef = useRef(false);
	const dropResultsClearTimerRef = useRef<ReturnType<typeof setTimeout> \| null>(null);
	// KI-202 (2026-05-15) — utterance-batching state.
	// pendingUtteranceRef accumulates the Web Speech transcript across multiple
	// onend events separated by sub-grace-window pauses. pendingChunksRef does
	// the same for MediaRecorder blobs so the Sarvam POST sees the WHOLE
	// utterance, not just the tail after the last pause. pendingSubmitTimerRef
	// is the grace-window setTimeout; it gets reset every time onend appends
	// more content.
	const pendingUtteranceRef = useRef<string>("");
	const pendingChunksRef = useRef<Blob[]>([]);
	const pendingSubmitTimerRef = useRef<ReturnType<typeof setTimeout> \| null>(null);
	// FIX 3 (HIGH) — one-shot barge-in signal. Flipped true by triggerBargeIn
	// when the VAD detects sustained user speech over bot TTS. Read+cleared
	// via consumeBargeInSignal() so the caller (page.tsx) can abort any
	// in-flight /api/chat request that's still assembling more TTS audio.
	const bargeInRequestedRef = useRef<boolean>(false);
	// KI-228 (2026-05-15) — V6.8 adaptive noise floor. Persistent across the
	// entire hook lifetime so a user's noise environment learned across the
	// first 5 seconds carries through later TTS plays even if the audio
	// effect tears down + rebuilds the analyser between turns.
	const noiseFloorRef = useRef<AdaptiveNoiseFloor>(new AdaptiveNoiseFloor());
	// KI-225 (2026-05-15) — V1.3 sample-rate-aware ZCR band, cached from the
	// AudioContext at analyser-build time. Falls back to the 48 kHz reference
	// band when the context isn't up yet.
	const zcrBandRef = useRef<{ min: number; max: number }>({ min: 20, max: 250 });

	// ----------------------------------------------------------------------
	// KI-168 PHASE 2 — Sarvam authoritative-transcript layer.
	// We run a MediaRecorder in parallel with SpeechRecognition. When the
	// browser detects end-of-utterance silence (recognition.onend), we
	// already have the raw audio chunks in memory. Send them to the backend
	// /api/transcribe endpoint (Sarvam STT) and replace the Web Speech text
	// with Sarvam's authoritative result. Web Speech remains the fallback if
	// Sarvam times out, errors, or the audio path failed to initialise.
	// ----------------------------------------------------------------------
	const mediaStreamRef = useRef<MediaStream \| null>(null);
	const mediaRecorderRef = useRef<MediaRecorder \| null>(null);
	const chunksRef = useRef<Blob[]>([]);
	const recorderMimeRef = useRef<string>("audio/webm");
	// True only when MediaRecorder.start() actually succeeded. If false we
	// bypass the Sarvam path and use Web Speech transcripts directly.
	const recorderActiveRef = useRef(false);
	// Promise resolved on the recorder's next `stop` event so we can wait
	// for the final ondataavailable chunk before building the blob.
	const recorderStopWaiterRef = useRef<(() => void) \| null>(null);

	// ----------------------------------------------------------------------
	// #53 / #54 — warm-stream + pre-roll push-to-talk state.
	//
	// SEPARATE from the Live-mode mediaStream/mediaRecorder above. The Live
	// recorder is acquired/torn-down per utterance and is gated on the
	// `enabled` prop (which page.tsx flips OFF during push-to-talk). This warm
	// stream is the OPPOSITE lifecycle: opened once after the user opts into
	// voice, kept alive across the Live↔PTT toggle for the hook's mounted
	// lifetime, never closed per-press. Holding a persistent open audio device
	// keeps the OS mic hot so any per-press getUserMedia (Live's OR page.tsx's
	// PTT) resolves near-instantly instead of cold-starting.
	// ----------------------------------------------------------------------
	const warmStreamRef = useRef<MediaStream \| null>(null);
	const warmRecorderRef = useRef<MediaRecorder \| null>(null);
	const warmCtxRef = useRef<AudioContext \| null>(null);
	const warmMimeRef = useRef<string>("audio/webm");
	// The rolling pre-roll ring — always holds ~PRE_ROLL_MS of the most recent
	// audio so a PTT engage can prepend the lead-in the user spoke during the
	// cold-start gap.
	const preRollRef = useRef<PreRollRing>(new PreRollRing(PRE_ROLL_MS));
	// Live capture slices accumulated between PTT engage and release. The
	// submitted blob is preRoll.drain() (lead-in) ++ these (live capture).
	const pttCaptureRef = useRef<Blob[]>([]);
	// True between a deliberate engage and the matching release — the warm
	// recorder's ondataavailable routes slices to pttCaptureRef instead of
	// (only) the pre-roll ring while this is set.
	const pttEngagedRef = useRef<boolean>(false);
	// wall-clock ms of the current hold's keydown (0 when not pressed). Used
	// by evaluateHoldGate to classify deliberate hold vs sub-threshold tap.
	const pttPressedAtRef = useRef<number>(0);
	// setTimeout id for the deliberate-hold engage. Fires HOLD_THRESHOLD_MS
	// after press; if release beats it, the press was a tap and is discarded.
	const pttHoldTimerRef = useRef<ReturnType<typeof setTimeout> \| null>(null);
	// True once the user has opted into voice at least once. Latches the warm
	// stream ON for the rest of the hook's mounted lifetime so it survives the
	// Live↔PTT toggle (page.tsx flips `enabled` false for pure PTT).
	const voiceEverEnabledRef = useRef<boolean>(false);
	const [isWarm, setIsWarm] = useState<boolean>(false);

	const [isSupported] = useState<boolean>(() => resolveCtor() !== null);

	const clearRestartTimer = useCallback(() => {
	if (restartTimerRef.current !== null) {
	clearTimeout(restartTimerRef.current);
	restartTimerRef.current = null;
	}
	}, []);

	// KI-210 (2026-05-15) — hold a voice utterance while a text turn is in
	// flight instead of dropping it. Checks isTextRequestPendingRef, sleeping
	// 300ms between checks.
	//   resolves true  — the pending flag cleared (or was never set)
	//   resolves false — more than maxWaitMs elapsed while still pending; the
	//                    caller proceeds anyway rather than leaking the
	//                    utterance forever on a stuck text request
	const waitForTextClear = useCallback(async (maxWaitMs = 30000): Promise<boolean> => {
	  const POLL_INTERVAL_MS = 300;
	  const startedAt = Date.now();
	  for (;;) {
	    if (!isTextRequestPendingRef.current) {
	      return true; // text cleared, ok to proceed
	    }
	    if (Date.now() - startedAt > maxWaitMs) {
	      console.debug("[useStreamingVoice] KI-210 wait timed out, submitting anyway");
	      return false; // gave up waiting — proceed anyway
	    }
	    await new Promise<void>((wake) => {
	      setTimeout(wake, POLL_INTERVAL_MS);
	    });
	  }
	}, [isTextRequestPendingRef]);

	// Start the current recognition instance if there is one. start() throws
	// InvalidStateError when recognition is already running; that is harmless
	// here because onstart/onend keep the listening state in sync.
	const safeStart = useCallback(() => {
	  try {
	    recognitionRef.current?.start();
	  } catch {
	    // Already running — nothing to do.
	  }
	}, []);

	// Choose the MediaRecorder mimeType to record with. Candidates are probed in
	// preference order: iOS Safari only supports audio/mp4, Chromium/Firefox
	// prefer audio/webm (mirrors the page.tsx PTT recorder + KI-134 fallback).
	// Returns "" when MediaRecorder is unavailable or no candidate is supported,
	// which callers treat as "let the browser pick its default".
	const pickRecorderMime = useCallback((): string => {
	  const recorderMissing =
	    typeof window === "undefined" || typeof MediaRecorder === "undefined";
	  if (recorderMissing) return "";
	  const preferred = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/mpeg"];
	  const supported = preferred.find((mimeType) => {
	    try {
	      return MediaRecorder.isTypeSupported(mimeType);
	    } catch {
	      // A throwing probe counts as "not supported"; try the next one.
	      return false;
	    }
	  });
	  return supported ?? "";
	}, []);

	// Stop the Live-mode MediaRecorder and resolve once its onstop handler has
	// run (that handler invokes recorderStopWaiterRef). Resolves immediately
	// when there is no recorder, it is already inactive, or stop() throws.
	const stopRecorder = useCallback((): Promise<void> => {
	  const active = mediaRecorderRef.current;
	  if (!active || active.state === "inactive") {
	    return Promise.resolve();
	  }
	  return new Promise<void>((resolve) => {
	    // Park the resolver where the recorder's onstop handler will find it.
	    recorderStopWaiterRef.current = () => resolve();
	    try {
	      active.stop();
	    } catch {
	      // Already stopped: onstop will not fire, so settle right away.
	      recorderStopWaiterRef.current = null;
	      resolve();
	    }
	  });
	}, []);

	// Release every Live-mode capture resource: stop the recorder, detach its
	// handlers, stop all mic tracks, and reset the associated refs. Safe to call
	// when nothing is active.
	// NOTE(review): a stop waiter parked by stopRecorder() is discarded here
	// without being invoked, so a stopRecorder() promise pending at this moment
	// never settles — confirm that is intended.
	const teardownAudio = useCallback(() => {
	  const activeRecorder = mediaRecorderRef.current;
	  if (activeRecorder) {
	    try {
	      if (activeRecorder.state !== "inactive") activeRecorder.stop();
	    } catch {
	      // ignore
	    }
	    // Detach after stop() so the trailing dataavailable/stop events are not
	    // delivered to this hook.
	    activeRecorder.ondataavailable = null;
	    activeRecorder.onstop = null;
	    activeRecorder.onerror = null;
	  }
	  mediaRecorderRef.current = null;
	  const activeStream = mediaStreamRef.current;
	  if (activeStream) {
	    for (const track of activeStream.getTracks()) {
	      try {
	        track.stop();
	      } catch {
	        /* ignore */
	      }
	    }
	  }
	  mediaStreamRef.current = null;
	  chunksRef.current = [];
	  recorderActiveRef.current = false;
	  recorderStopWaiterRef.current = null;
	}, []);

	// Acquire the Live-mode mic stream and start the chunked MediaRecorder.
	// Resolves true when a recorder is (or already was) recording, false on any
	// failure. Failures never reject: they emit
	// onVoiceError("mic_permission_denied"), clear wantRunningRef and call
	// onListening(false), because callers invoke this as
	// `void ensureAudioCapture()` and would never observe a rejection.
	const ensureAudioCapture = useCallback(async (): Promise<boolean> => {
	  if (mediaRecorderRef.current && recorderActiveRef.current) return true;
	  if (typeof navigator === "undefined" || !navigator.mediaDevices) return false;
	  if (typeof MediaRecorder === "undefined") return false;
	  // Tracked outside the try so the catch can release whatever was acquired
	  // before the failure.
	  let stream: MediaStream | null = null;
	  let recorder: MediaRecorder | null = null;
	  try {
	    // KI-185 (2026-05-15) — explicit AEC + noise suppression + auto-gain.
	    // Default `{audio: true}` does NOT force AEC across all browsers, so the
	    // mic was transcribing the bot's own TTS audio bleeding from speakers
	    // back into the mic.
	    const pendingStream = navigator.mediaDevices.getUserMedia({
	      audio: {
	        echoCancellation: true,
	        noiseSuppression: true,
	        autoGainControl: true,
	      },
	    });
	    // W2 (2026-05-15) — 2s watchdog around getUserMedia. Some devices STALL
	    // getUserMedia indefinitely instead of rejecting; the watchdog rejects
	    // with name="StallTimeout" so the catch below treats it like a denial.
	    let stalled = false;
	    let stallTimer: ReturnType<typeof setTimeout> | undefined;
	    const watchdog = new Promise<MediaStream>((_, reject) => {
	      stallTimer = setTimeout(() => {
	        stalled = true;
	        const e = new Error("getUserMedia stalled >2s") as Error & { name: string };
	        e.name = "StallTimeout";
	        reject(e);
	      }, 2000);
	    });
	    // If the watchdog wins the race, nobody owns a stream that arrives
	    // later: stop its tracks so the mic does not stay open in the background.
	    void pendingStream.then(
	      (late) => {
	        if (!stalled) return;
	        late.getTracks().forEach((t) => {
	          try { t.stop(); } catch { /* ignore */ }
	        });
	      },
	      () => {
	        /* the rejection is delivered through the race below */
	      },
	    );
	    // Always clear the watchdog timer once the race settles.
	    stream = await Promise.race([pendingStream, watchdog]).finally(() => {
	      clearTimeout(stallTimer);
	    });
	    const mime = pickRecorderMime();
	    recorderMimeRef.current = mime || "audio/webm";
	    recorder = mime ? new MediaRecorder(stream, { mimeType: mime }) : new MediaRecorder(stream);
	    chunksRef.current = [];
	    recorder.ondataavailable = (ev: BlobEvent) => {
	      if (ev.data && ev.data.size > 0) chunksRef.current.push(ev.data);
	    };
	    recorder.onstop = () => {
	      // Release whoever is awaiting stopRecorder().
	      const waiter = recorderStopWaiterRef.current;
	      recorderStopWaiterRef.current = null;
	      if (waiter) waiter();
	    };
	    recorder.onerror = (ev: Event) => {
	      console.debug("[useStreamingVoice] MediaRecorder error", ev);
	    };
	    mediaStreamRef.current = stream;
	    mediaRecorderRef.current = recorder;
	    // 1s timeslice so chunks land progressively — ondataavailable fires
	    // once per second instead of only on stop().
	    recorder.start(1000);
	    // W2 (2026-05-15) — affirmative post-acquire validation. A recorder that
	    // start()s without throwing is not proof the capture is alive; treat any
	    // non-"recording" state as a hard failure. The catch below releases the
	    // stream and clears the refs.
	    if (recorder.state !== "recording") {
	      throw Object.assign(new Error(`MediaRecorder did not enter recording state (got ${recorder.state})`), {
	        name: "RecorderNotRecording",
	      });
	    }
	    recorderActiveRef.current = true;
	    console.debug("[useStreamingVoice] MediaRecorder started", {
	      mime: recorderMimeRef.current,
	      state: recorder.state,
	    });
	    return true;
	  } catch (err) {
	    // W1 (2026-05-15) — DOMException name → VoiceError mapping.
	    //   NotAllowedError / SecurityError      → user denied or browser-blocked
	    //   NotFoundError / OverconstrainedError → no usable input device
	    //   NotReadableError / AbortError        → OS-level mic owned by another app
	    //   anything else (incl. plain Error)    → treated as denial so the UI
	    //                                          still surfaces a banner
	    // ALL of these map to "mic_permission_denied" because the user-visible
	    // remediation is the same: open site permissions, allow mic, reload.
	    const name = (err as { name?: string } | null)?.name ?? "Error";
	    console.debug(
	      "[useStreamingVoice] getUserMedia / MediaRecorder init failed",
	      { name, err },
	    );
	    recorderActiveRef.current = false;
	    // The failure may have happened AFTER the stream was acquired (the
	    // MediaRecorder constructor or start() threw, or the recorder never
	    // reached "recording"). Release whatever this attempt acquired so the
	    // mic indicator does not stay lit over a dead capture.
	    if (recorder) {
	      recorder.ondataavailable = null;
	      recorder.onstop = null;
	      recorder.onerror = null;
	      try {
	        if (recorder.state !== "inactive") recorder.stop();
	      } catch {
	        /* ignore */
	      }
	    }
	    if (stream) {
	      stream.getTracks().forEach((t) => {
	        try { t.stop(); } catch { /* ignore */ }
	      });
	    }
	    // Only clear the refs if they still point at this attempt's objects.
	    if (mediaRecorderRef.current === recorder) mediaRecorderRef.current = null;
	    if (mediaStreamRef.current === stream) mediaStreamRef.current = null;
	    // Surface to the page-level banner. Cast through the local widened
	    // VoiceError union (W1) so TS accepts the new string code.
	    try {
	      onVoiceErrorRef.current("mic_permission_denied" as VoiceError);
	    } catch {
	      /* never let a user-supplied callback crash the hook */
	    }
	    // Stop the recognition restart loop and reset listening state so the
	    // pill doesn't stay green over a dead mic. The parent (page.tsx) is
	    // expected to also flip `enabled` back to false on the banner code,
	    // which calls our `stop()` and idempotently cleans up.
	    wantRunningRef.current = false;
	    try {
	      onListeningRef.current(false);
	    } catch {
	      /* ignore */
	    }
	    return false;
	  }
	}, [pickRecorderMime]);

	// ======================================================================
	// #53 / #54 — warm-stream + pre-roll push-to-talk engine.
	// ======================================================================

	// Tear down the warm push-to-talk engine: cancel the pending hold timer,
	// reset all PTT press/capture state, empty the pre-roll ring, stop the warm
	// recorder and its mic tracks, close the warm AudioContext, and flip isWarm
	// to false. Safe to call when nothing is armed.
	const disarmWarmStream = useCallback(() => {
	  if (pttHoldTimerRef.current !== null) {
	    clearTimeout(pttHoldTimerRef.current);
	    pttHoldTimerRef.current = null;
	  }
	  pttEngagedRef.current = false;
	  pttPressedAtRef.current = 0;
	  pttCaptureRef.current = [];
	  preRollRef.current.clear();
	  const rec = warmRecorderRef.current;
	  if (rec) {
	    try {
	      // Detach handlers BEFORE stop() so this deliberate stop is not
	      // reported through onerror/onstop and no trailing slice is delivered.
	      rec.ondataavailable = null;
	      rec.onerror = null;
	      rec.onstop = null;
	      if (rec.state !== "inactive") rec.stop();
	    } catch {
	      /* ignore */
	    }
	  }
	  warmRecorderRef.current = null;
	  const stream = warmStreamRef.current;
	  if (stream) {
	    stream.getTracks().forEach((t) => {
	      try { t.stop(); } catch { /* ignore */ }
	    });
	  }
	  warmStreamRef.current = null;
	  const ctx = warmCtxRef.current;
	  if (ctx) {
	    warmCtxRef.current = null;
	    try {
	      // close() returns a promise; a rejection (e.g. the context is already
	      // closed) would bypass the surrounding try/catch and surface as an
	      // unhandled rejection, so it is swallowed explicitly.
	      void ctx.close().catch(() => {
	        /* ignore */
	      });
	    } catch {
	      /* ignore */
	    }
	  }
	  setIsWarm(false);
	}, []);

	// Acquire (or re-acquire) the persistent warm stream. Idempotent: a
	// healthy recording warm recorder short-circuits. On failure routes
	// through the SAME onVoiceError("mic_permission_denied") contract the
	// Live path uses — never a silent failure. Resolves true when the warm
	// recorder is recording, false otherwise; never rejects.
	const armWarmStream = useCallback(async (): Promise<boolean> => {
	  voiceEverEnabledRef.current = true;
	  const existing = warmRecorderRef.current;
	  if (existing && existing.state === "recording" && warmStreamRef.current) {
	    return true;
	  }
	  if (typeof navigator === "undefined" || !navigator.mediaDevices) return false;
	  if (typeof MediaRecorder === "undefined") return false;
	  // Tear down any half-built prior attempt before re-acquiring.
	  if (existing || warmStreamRef.current) disarmWarmStream();
	  // Tracked outside the try so the catch can release whatever was acquired
	  // before the failure.
	  let stream: MediaStream | null = null;
	  let recorder: MediaRecorder | null = null;
	  try {
	    // Same AEC/NS/AGC constraints as the Live + PTT paths (KI-185) so the
	    // pre-roll is echo-cancelled identically to the rest of the capture.
	    const pendingStream = navigator.mediaDevices.getUserMedia({
	      audio: {
	        echoCancellation: true,
	        noiseSuppression: true,
	        autoGainControl: true,
	      },
	    });
	    // W2-style 2s stall watchdog so a hung getUserMedia surfaces a banner
	    // instead of pinning the warm state forever.
	    let stalled = false;
	    let stallTimer: ReturnType<typeof setTimeout> | undefined;
	    const watchdog = new Promise<MediaStream>((_, reject) => {
	      stallTimer = setTimeout(() => {
	        stalled = true;
	        const e = new Error("warm getUserMedia stalled >2s") as Error & { name: string };
	        e.name = "StallTimeout";
	        reject(e);
	      }, 2000);
	    });
	    // If the watchdog wins the race, nobody owns a stream that arrives
	    // later: stop its tracks so the mic does not stay open in the background.
	    void pendingStream.then(
	      (late) => {
	        if (!stalled) return;
	        late.getTracks().forEach((t) => {
	          try { t.stop(); } catch { /* ignore */ }
	        });
	      },
	      () => {
	        /* the rejection is delivered through the race below */
	      },
	    );
	    // Always clear the watchdog timer once the race settles.
	    stream = await Promise.race([pendingStream, watchdog]).finally(() => {
	      clearTimeout(stallTimer);
	    });
	    const mime = pickRecorderMime();
	    warmMimeRef.current = mime || "audio/webm";
	    recorder = mime
	      ? new MediaRecorder(stream, { mimeType: mime })
	      : new MediaRecorder(stream);
	    preRollRef.current = new PreRollRing(PRE_ROLL_MS);
	    // Only reset the capture state when no press is being held. On a cold
	    // start beginPushToTalk kicks off this arm, and the hold timer can
	    // engage while getUserMedia is still pending; wiping the engaged flag
	    // here would make the release look like a tap and discard the turn.
	    if (pttPressedAtRef.current === 0) {
	      pttCaptureRef.current = [];
	      pttEngagedRef.current = false;
	    }
	    recorder.ondataavailable = (ev: BlobEvent) => {
	      if (!ev.data || ev.data.size <= 0) return;
	      // Always feed the rolling pre-roll ring so the lead-in is ready the
	      // instant a PTT engage fires. When a PTT capture is engaged, ALSO
	      // accumulate the slice into the live capture buffer — the submitted
	      // blob is preRoll.drain() (lead-in) ++ pttCaptureRef (live).
	      preRollRef.current.push(ev.data, WARM_TIMESLICE_MS);
	      if (pttEngagedRef.current) {
	        pttCaptureRef.current.push(ev.data);
	      }
	    };
	    recorder.onerror = (ev: Event) => {
	      console.debug("[useStreamingVoice] warm MediaRecorder error", ev);
	      try { onVoiceErrorRef.current("stream_stale"); } catch { /* ignore */ }
	    };
	    recorder.onstop = () => {
	      // The warm recorder should never stop on its own while armed; if it
	      // does (device unplug, OS interruption) it is logged here and the
	      // next arm call sees a non-recording recorder and re-acquires.
	      console.debug("[useStreamingVoice] warm MediaRecorder stopped");
	    };
	    warmStreamRef.current = stream;
	    warmRecorderRef.current = recorder;
	    recorder.start(WARM_TIMESLICE_MS);
	    // A start() that did not throw is not proof of a live capture; the
	    // catch below releases the stream and clears the refs.
	    if (recorder.state !== "recording") {
	      throw Object.assign(
	        new Error(`warm MediaRecorder not recording (got ${recorder.state})`),
	        { name: "RecorderNotRecording" },
	      );
	    }
	    // Keep an AudioContext warm + RUNNING so it never has to be resumed
	    // lazily on first press (a suspended ctx is one of the documented
	    // first-word-loss vectors). resume() needs a user gesture on some
	    // browsers.
	    try {
	      const Ctor = (window.AudioContext
	        || (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext);
	      if (Ctor) {
	        if (!warmCtxRef.current || warmCtxRef.current.state === "closed") {
	          warmCtxRef.current = new Ctor();
	        }
	        if (warmCtxRef.current.state === "suspended") {
	          void warmCtxRef.current.resume().catch((err) => {
	            console.debug("[useStreamingVoice] warm AudioContext.resume failed", err);
	            try { onVoiceErrorRef.current("audio_context_suspended"); } catch { /* ignore */ }
	          });
	        }
	      }
	    } catch {
	      /* AudioContext is best-effort for warmth; capture still works */
	    }
	    setIsWarm(true);
	    console.debug("[useStreamingVoice] warm stream armed", {
	      mime: warmMimeRef.current,
	      preRollMs: PRE_ROLL_MS,
	      timesliceMs: WARM_TIMESLICE_MS,
	    });
	    return true;
	  } catch (err) {
	    const name = (err as { name?: string } | null)?.name ?? "Error";
	    console.debug("[useStreamingVoice] warm stream arm failed", { name, err });
	    // The failure may have happened AFTER the stream was acquired (the
	    // MediaRecorder constructor or start() threw, or the recorder never
	    // reached "recording"). Release whatever this attempt acquired so the
	    // mic does not stay open behind a failed arm.
	    if (recorder) {
	      recorder.ondataavailable = null;
	      recorder.onerror = null;
	      recorder.onstop = null;
	      try {
	        if (recorder.state !== "inactive") recorder.stop();
	      } catch {
	        /* ignore */
	      }
	    }
	    if (stream) {
	      stream.getTracks().forEach((t) => {
	        try { t.stop(); } catch { /* ignore */ }
	      });
	    }
	    // Only clear the refs if they still point at this attempt's objects.
	    if (warmRecorderRef.current === recorder) warmRecorderRef.current = null;
	    if (warmStreamRef.current === stream) warmStreamRef.current = null;
	    setIsWarm(false);
	    try {
	      onVoiceErrorRef.current("mic_permission_denied" as VoiceError);
	    } catch {
	      /* never let a user callback crash the hook */
	    }
	    return false;
	  }
	}, [pickRecorderMime, disarmWarmStream]);

	// Hand back whatever preRollRef's drain() returns for the current pre-roll
	// ring.
	const consumePreRollChunks = useCallback(
	  (): Blob[] => preRollRef.current.drain(),
	  [],
	);

	// Transcribe an assembled PTT capture and deliver the result. Uses the SAME
	// Sarvam-with-retry path as the Live grace timer (KI-226/302), then hands
	// the text to onFinalTranscript. Resolves with the authoritative transcript,
	// or null when there was nothing to send, the blob was below the noise
	// floor, transcription failed, or it came back empty.
	const submitPttBlob = useCallback(
	  async (chunks: Blob[]): Promise<string | null> => {
	    if (chunks.length === 0) return null;
	    const mimeType = warmMimeRef.current || "audio/webm";
	    const blob = new Blob(chunks, { type: mimeType });
	    // ~3 KB empirical noise floor (same as the Live path / PTT KI-134).
	    const MIN_BLOB_BYTES = 3000;
	    if (blob.size < MIN_BLOB_BYTES) {
	      console.debug("[useStreamingVoice] PTT blob below noise floor — discard", {
	        bytes: blob.size,
	      });
	      return null;
	    }
	    // Do not race an in-flight text turn.
	    await waitForTextClear();
	    // Per-attempt timeout scales with the audio size: 8s floor plus 12s per
	    // estimated chunk, capped at 2 minutes.
	    const APPROX_BYTES_PER_CHUNK = 100_000; // ~25s of webm/opus
	    const estChunks = Math.max(1, Math.ceil(blob.size / APPROX_BYTES_PER_CHUNK));
	    const attemptTimeoutMs = Math.min(120_000, 8_000 + estChunks * 12_000);
	    const sarvam = await retryPostTranscribe(async (signal) => {
	      // Each attempt aborts on its own timeout OR when the outer retry loop
	      // is aborted, whichever comes first.
	      const attemptCtl = new AbortController();
	      const abortAttempt = () => attemptCtl.abort();
	      const timeoutId = setTimeout(abortAttempt, attemptTimeoutMs);
	      signal.addEventListener("abort", abortAttempt);
	      try {
	        return await postTranscribe(blob, language, attemptCtl.signal);
	      } finally {
	        clearTimeout(timeoutId);
	        signal.removeEventListener("abort", abortAttempt);
	      }
	    });
	    if (!sarvam) {
	      try { onVoiceErrorRef.current("transcribe_failed"); } catch { /* ignore */ }
	      return null;
	    }
	    const transcript = (sarvam.text || "").trim();
	    if (!transcript) return null;
	    // A text turn may have started during the round-trip; wait it out
	    // before delivering.
	    await waitForTextClear();
	    onFinalRef.current(transcript);
	    return transcript;
	  },
	  [language, waitForTextClear],
	);

	// PTT engage — runs HOLD_THRESHOLD_MS after a deliberate press. Flips the
	// warm recorder's slice routing so new slices also accumulate into the
	// capture buffer, and seeds that buffer with the drained pre-roll so the
	// lead-in spoken before the engage sits at the head of the submitted blob.
	const engagePtt = useCallback(() => {
	  pttEngagedRef.current = true;
	  const preRollSlices = preRollRef.current.drain();
	  // Copy so the capture buffer never aliases the array handed out by drain().
	  pttCaptureRef.current = Array.from(preRollSlices);
	  console.debug("[useStreamingVoice] PTT engaged", {
	    leadInSlices: preRollSlices.length,
	  });
	}, []);

	// Press handler for push-to-talk. Records the press time, resets the
	// capture state, makes sure the warm stream is (being) armed, and schedules
	// the deliberate-hold engage HOLD_THRESHOLD_MS later.
	const beginPushToTalk = useCallback(() => {
	  pttEngagedRef.current = false;
	  pttCaptureRef.current = [];
	  pttPressedAtRef.current = Date.now();
	  // Make sure the warm stream is up so the pre-roll is actually filling.
	  // armWarmStream short-circuits when a healthy recorder already exists.
	  void armWarmStream();
	  const staleHoldTimer = pttHoldTimerRef.current;
	  if (staleHoldTimer !== null) clearTimeout(staleHoldTimer);
	  // Deliberate-hold gate: a tap released before the threshold never
	  // engages. The audio spoken while waiting is still in the pre-roll ring.
	  pttHoldTimerRef.current = setTimeout(() => {
	    pttHoldTimerRef.current = null;
	    // Release zeroes pttPressedAtRef, so zero here means "already released".
	    if (pttPressedAtRef.current === 0) return;
	    engagePtt();
	  }, HOLD_THRESHOLD_MS);
	}, [armWarmStream, engagePtt]);

	// Release handler for push-to-talk. Clears the press state and the hold
	// timer, then either discards the press (sub-threshold tap, or release
	// before the engage fired) or submits the captured slices via
	// submitPttBlob. Resolves with the transcript, or null when nothing was
	// submitted or transcription produced no text.
	// NOTE(review): audio the warm recorder has not yet emitted as a slice at
	// release time is not part of the submitted capture — confirm
	// WARM_TIMESLICE_MS keeps that tail negligible.
	const endPushToTalk = useCallback(async (): Promise<string | null> => {
	  const releasedAt = Date.now();
	  const pressedAt = pttPressedAtRef.current;
	  pttPressedAtRef.current = 0;
	  const holdTimer = pttHoldTimerRef.current;
	  if (holdTimer !== null) {
	    clearTimeout(holdTimer);
	    pttHoldTimerRef.current = null;
	  }
	  // A missing press timestamp (0) is evaluated as a zero-length hold.
	  const gate = evaluateHoldGate(
	    pressedAt || releasedAt,
	    releasedAt,
	    HOLD_THRESHOLD_MS,
	  );
	  const wasEngaged = pttEngagedRef.current;
	  pttEngagedRef.current = false;
	  // Take ownership of the captured slices and reset the shared buffer.
	  const captured = pttCaptureRef.current;
	  pttCaptureRef.current = [];
	  if (gate.deliberate && wasEngaged) {
	    return submitPttBlob(captured);
	  }
	  // Sub-threshold tap (or release before engage fired): discard. The
	  // pre-roll ring keeps rolling for the warm stream; nothing submitted.
	  console.debug("[useStreamingVoice] PTT discarded (tap)", {
	    heldMs: gate.heldMs,
	    deliberate: gate.deliberate,
	    wasEngaged,
	  });
	  return null;
	}, [submitPttBlob]);

	const buildRecognition = useCallback((): SpeechRecognitionInstance \| null => {
	const Ctor = resolveCtor();
	if (!Ctor) return null;
	const rec = new Ctor();
	rec.lang = language;
	rec.continuous = false;
	rec.interimResults = true;
	rec.maxAlternatives = 1;

	rec.onstart = () => {
	onListeningRef.current(true);
	};

	rec.onresult = (ev: SpeechRecognitionEventLike) => {
	// KI-203 (2026-05-15) — early-return while TTS is playing (or within
	// the POST_TTS_DROP_MS window after TTS ends). recognition.abort()
	// doesn't immediately stop result delivery, so we silently discard
	// every chunk that arrives during the dirty window. Without this, bot
	// TTS audio ("perfect days to get started Rohit") was leaking into
	// the user input field between `audio.play()` firing and our abort()
	// actually taking effect.
	if (dropResultsRef.current \|\| isTextRequestPendingRef.current) {
	console.debug("[useStreamingVoice] KI-203/214 dropping recognition result", {
	drop: dropResultsRef.current,
	textPending: isTextRequestPendingRef.current,
	});
	return;
	}
	let interim = "";
	// Walk every result; finals get pushed onto finalsRef, interims get
	// concatenated into a running string that's displayed in the input.
	for (let i = 0; i < ev.results.length; i++) {
	const result = ev.results[i];
	const alt = result[0];
	if (!alt) continue;
	if (result.isFinal) {
	const t = alt.transcript.trim();
	if (t) finalsRef.current.push(t);
	} else {
	interim += alt.transcript;
	}
	}
	// #68 — the composer must show the COMPLETE evolving transcript, not
	// just the current recognition session's slice. continuous=false makes
	// Web Speech end+restart on every sub-1.5s pause; each restart begins a
	// fresh result list, and finals can also be skipped here during the
	// TTS/text drop window above even though the audio (→ Sarvam) still has
	// them. The authoritative running text the grace timer will submit is
	// `pendingUtteranceRef` (earlier graced segments of THIS utterance) +
	// the current session's NOT-YET-DRAINED finals + the live interim.
	//
	// Critical: finals already moved into pendingUtteranceRef on `onend`
	// stay in finalsRef until submit (so a late isFinal isn't lost), and
	// `finalsConsumedRef` is the cursor of how many were drained. Joining
	// ALL of finalsRef would double-count those (segment shown twice). So
	// we display pending + finalsRef.slice(consumed) + interim — the exact
	// union with no duplication and no lag behind what was captured/sent.
	const priorSegments = pendingUtteranceRef.current.trim();
	const freshFinals = finalsRef.current
	.slice(finalsConsumedRef.current)
	.join(" ")
	.trim();
	const running = [priorSegments, freshFinals, interim]
	.map((s) => s.trim())
	.filter(Boolean)
	.join(" ")
	.trim();
	onInterimRef.current(running);
	};

	rec.onerror = (ev: SpeechRecognitionErrorEventLike) => {
	const code = ev.error;
	// `no-speech` and `aborted` are routine in continuous-restart mode —
	// no audio detected in a window, or we deliberately stopped. Silent
	// restart via onend.
	if (code === "no-speech" \|\| code === "aborted") return;
	if (code === "not-allowed" \|\| code === "service-not-allowed") {
	wantRunningRef.current = false;
	// FIX 2 (HIGH) — Terminal-error mic leak. Without teardownAudio()
	// here the MediaRecorder + MediaStream stay open even though
	// recognition has shut down, so the browser's red-dot mic
	// indicator stays lit and the OS thinks we're still recording.
	teardownAudio();
	onErrorRef.current(
	"Mic permission denied. Click the lock icon in your browser's URL bar to enable the microphone.",
	);
	return;
	}
	if (code === "audio-capture") {
	wantRunningRef.current = false;
	// FIX 2 (HIGH) — see above.
	teardownAudio();
	onErrorRef.current("No microphone detected. Check your audio device and try again.");
	return;
	}
	if (code === "network") {
	// Transient — let onend's restart loop pick it up with backoff.
	errorBackoffRef.current = Math.min(errorBackoffRef.current + 500, 3000);
	return;
	}
	onErrorRef.current(`Voice error: ${code}${ev.message ? ` (${ev.message})` : ""}`);
	};

	rec.onend = () => {
	onListeningRef.current(false);
	// KI-217 — drain only the NEW finals (everything past the consumed
	// cursor). DO NOT reset finalsRef here: a late-delivered isFinal
	// chunk arriving after onend would otherwise be wiped before the
	// next onend cycle can pick it up. finalsRef is reset on actual
	// utterance submit (grace-timer flush) and on user start/stop.
	const newFinals = finalsRef.current.slice(finalsConsumedRef.current);
	const webSpeechText = newFinals.join(" ").trim();
	finalsConsumedRef.current = finalsRef.current.length;

	// KI-168 PHASE 2 — race guard: if a typed-text turn is in flight,
	// drop both transcripts on the floor (text wins). Don't start a
	// Sarvam fetch we'd be throwing away.
	const textRacing = isTextRequestPendingRef.current;

	// FIX 7 (HIGH) — Silent onend early-return. Chrome's "no-speech"
	// restart loop fires onend every ~5s with no content. Without this
	// guard, every silent onend re-arms the 1500ms grace timer and the
	// grace window extends forever — even when there's nothing pending
	// to submit. Skip the grace-timer reset when:
	// - no new Web Speech text in this cycle, AND
	// - no audio chunks captured this cycle (chunksRef holds the
	// undrained chunks that will become drainedThisEnd below), AND
	// - no previously pending utterance text.
	// We still call scheduleRestart() so the mic comes back online.
	const hasNewChunksThisEnd = recorderActiveRef.current && chunksRef.current.length > 0;
	if (!webSpeechText && !hasNewChunksThisEnd && pendingUtteranceRef.current === "") {
	console.debug("[useStreamingVoice] KI-222 silent onend — skipping grace reset");
	// Inline the restart-only path here so we don't need to refactor
	// the scheduleRestart closure below it.
	if (wantRunningRef.current && !isTextRequestPendingRef.current) {
	const backoff = errorBackoffRef.current;
	errorBackoffRef.current = 0;
	clearRestartTimer();
	restartTimerRef.current = setTimeout(() => {
	restartTimerRef.current = null;
	if (wantRunningRef.current) safeStart();
	}, Math.max(50, backoff));
	} else if (wantRunningRef.current && isTextRequestPendingRef.current) {
	clearRestartTimer();
	restartTimerRef.current = setTimeout(() => {
	restartTimerRef.current = null;
	if (wantRunningRef.current && !isTextRequestPendingRef.current) safeStart();
	}, 250);
	}
	return;
	}

	const scheduleRestart = () => {
	if (wantRunningRef.current && !isTextRequestPendingRef.current) {
	const backoff = errorBackoffRef.current;
	errorBackoffRef.current = 0;
	clearRestartTimer();
	restartTimerRef.current = setTimeout(() => {
	restartTimerRef.current = null;
	if (wantRunningRef.current) safeStart();
	}, Math.max(50, backoff));
	} else if (wantRunningRef.current && isTextRequestPendingRef.current) {
	// Text turn in flight — retry shortly so mic resumes the moment
	// the text turn lands.
	clearRestartTimer();
	restartTimerRef.current = setTimeout(() => {
	restartTimerRef.current = null;
	if (wantRunningRef.current && !isTextRequestPendingRef.current) safeStart();
	}, 250);
	}
	};

	// Pull the chunks we've accumulated so far so the recorder can keep
	// capturing the next utterance without us re-running getUserMedia.
	const drainChunks = (): Blob[] => {
	const drained = chunksRef.current;
	chunksRef.current = [];
	return drained;
	};

	// KI-202 (2026-05-15) — utterance batching. Web Speech's onend fires
	// after ~1.5s of silence, so a natural mid-sentence pause splits one
	// utterance into two onend events and the user's sentence gets
	// submitted in halves ("First word getting cut off. Cutoff is the
	// biggest issue. Auto-submitting without capturing the first half
	// or the second half"). Instead of submitting immediately, we
	// append THIS onend's text + audio chunks to pendingUtterance*Ref
	// buffers, then start (or reset) a UTTERANCE_GRACE_MS timer. If
	// recognition restarts (auto-restart picks up the next word burst)
	// within the grace window, the next onend appends more content +
	// resets the timer. Only after a FULL UTTERANCE_GRACE_MS of true
	// silence does the timer fire and submit the accumulated buffer.
	//
	// Pauses < 1.5s merge into one turn (intended fix).
	// Pauses > 1.5s split (intended — that IS a new turn).

	// Drain the CURRENT onend's chunks now so the recorder keeps capturing
	// the next word burst without contamination across pending utterances.
	const drainedThisEnd = recorderActiveRef.current ? drainChunks() : [];
	if (webSpeechText) {
	pendingUtteranceRef.current = pendingUtteranceRef.current
	? `${pendingUtteranceRef.current} ${webSpeechText}`
	: webSpeechText;
	}
	if (drainedThisEnd.length > 0) {
	pendingChunksRef.current.push(...drainedThisEnd);
	}
	console.debug("[useStreamingVoice] KI-202 onend appended to pending utterance", {
	thisTextLen: webSpeechText.length,
	thisChunkCount: drainedThisEnd.length,
	pendingTextLen: pendingUtteranceRef.current.length,
	pendingChunkCount: pendingChunksRef.current.length,
	textRacing,
	});

	// Mic restart happens immediately regardless of grace window — we
	// WANT recognition to come back online so it can pick up the next
	// word burst within the grace window and append to pending.
	scheduleRestart();

	// KI-210 (2026-05-15) — DO NOT drop pending utterance when text is
	// racing. Previously we cleared pendingUtteranceRef + pendingChunksRef
	// here, which silently lost any voice the user spoke during the bot's
	// text-submit/TTS-thinking gap. The downstream wait-and-retry inside
	// `submitPendingUtterance` (timer fire) + the post-await wait inside
	// the Sarvam fire-and-forget now hold the buffer until the text turn
	// clears, then submit. We leave `textRacing` as a debug breadcrumb in
	// the log above and continue accumulating.

	// KI-210 — refactor the grace-timer body into a named async function
	// so it can re-schedule itself (wait-and-retry) when text is in flight
	// instead of dropping the utterance. Capped at 30s total wait so a
	// stuck text request can't leak the timer forever; if the cap fires
	// we proceed with submission anyway (better to submit than drop).
	const SUBMIT_WAIT_CAP_MS = 30000;
	const submitStartTsRef = { ts: 0 };
	const submitPendingUtterance = async () => {
	pendingSubmitTimerRef.current = null;

	// KI-210 — if text is still in flight when the grace window fires,
	// wait instead of dropping. Re-schedule a 300ms retry until either
	// text clears or we hit the 30s cap.
	if (isTextRequestPendingRef.current) {
	if (submitStartTsRef.ts === 0) submitStartTsRef.ts = Date.now();
	if (Date.now() - submitStartTsRef.ts > SUBMIT_WAIT_CAP_MS) {
	console.debug("[useStreamingVoice] KI-210 timer wait cap reached; submitting anyway");
	// fall through and submit
	} else {
	console.debug("[useStreamingVoice] KI-210 timer fired but text in flight; waiting 300ms");
	pendingSubmitTimerRef.current = setTimeout(() => {
	void submitPendingUtterance();
	}, 300);
	return;
	}
	}

	const accumulatedText = pendingUtteranceRef.current.trim();
	const accumulatedChunks = pendingChunksRef.current;
	pendingUtteranceRef.current = "";
	pendingChunksRef.current = [];
	// KI-217 — the utterance is now being submitted; safe to wipe
	// finalsRef + reset the consumed cursor. Any late results that
	// arrive after this point are for a NEW utterance.
	finalsRef.current = [];
	finalsConsumedRef.current = 0;
	console.debug("[useStreamingVoice] KI-202 grace window elapsed — submitting", {
	textLen: accumulatedText.length,
	chunkCount: accumulatedChunks.length,
	});

	// No-recorder path: just submit Web Speech text.
	if (!recorderActiveRef.current \|\| accumulatedChunks.length === 0) {
	if (accumulatedText) {
	onFinalRef.current(accumulatedText);
	}
	return;
	}

	// Sarvam path. Fire-and-forget so we don't block recognition.
	void (async () => {
	// Snapshot user-visible interim so the input area doesn't go blank
	// while Sarvam is in flight. The page-side input still shows the
	// Web Speech transcript; we'll overwrite it via onFinalTranscript
	// once Sarvam returns.
	if (accumulatedText) onInterimRef.current(accumulatedText);

	// We need to stop the recorder to get the final dataavailable
	// chunk for the LAST burst (anything mid-recording when the grace
	// window opened is in chunksRef, which we now flush into our
	// accumulated set before posting).
	await stopRecorder();
	const tailChunks = drainChunks();
	const allChunks = [...accumulatedChunks, ...tailChunks];
	const totalSize = allChunks.reduce((n, b) => n + b.size, 0);
	console.debug("[useStreamingVoice] KI-202 batched submit", {
	webSpeechLen: accumulatedText.length,
	chunkCount: allChunks.length,
	blobBytes: totalSize,
	});

	// Re-arm audio capture for the next utterance (don't block on it).
	teardownAudio();
	if (wantRunningRef.current) {
	void ensureAudioCapture();
	}

	// Skip submit when there's effectively no audio or no Web Speech
	// text. ~3 KB is the empirical noise floor used by the PTT path's
	// KI-134 silence guard.
	const MIN_BLOB_BYTES = 3000;
	if (!accumulatedText && totalSize < MIN_BLOB_BYTES) {
	console.debug("[useStreamingVoice] KI-202 skipping submit — no text and tiny blob");
	return;
	}

	// KI-210 — wait-and-retry instead of dropping. If a text turn
	// started during the await above, hold the utterance until it
	// clears (capped at 30s) instead of throwing it away.
	await waitForTextClear();

	let authoritativeText = accumulatedText;
	if (allChunks.length > 0 && totalSize >= MIN_BLOB_BYTES) {
	const blob = new Blob(allChunks, { type: recorderMimeRef.current \|\| "audio/webm" });
	// KI-226 (2026-05-15) — V5.4. Wrap the Sarvam POST in an
	// exponential-backoff retry (1s/2s/4s, max 3 attempts). The
	// accumulatedText (Web Speech fallback) and accumulated chunks
	// are already captured locally, so retries don't lose the
	// partial transcript. Each attempt enforces its own timeout
	// via the controller signal passed in by retryPostTranscribe.
	//
	// KI-302 (2026-05-18) — full-transcript fix. The backend now
	// SPLITS audio over Sarvam's ~30s REST limit into multiple
	// chunks and transcribes them sequentially (one Sarvam round
	// trip per ~25s of speech) so a long utterance is no longer
	// silently truncated to its first 30s. A fixed 8s client
	// timeout would abort that legitimately-longer multi-chunk
	// call mid-flight and force a fall back to the (also often
	// truncated) Web Speech text — re-introducing the very bug we
	// are fixing. Scale the per-attempt timeout with the audio
	// size: an 8s floor for short clips plus a generous budget per
	// estimated 25s chunk (browser webm/opus ≈ 4 KB/s ⇒ ~100 KB
	// per 25s chunk; allow ~10s of Sarvam latency per chunk).
	const APPROX_BYTES_PER_CHUNK = 100_000; // ~25s of webm/opus
	const estChunks = Math.max(
	1,
	Math.ceil(blob.size / APPROX_BYTES_PER_CHUNK),
	);
	const attemptTimeoutMs = Math.min(
	120_000, // hard ceiling — never wait > 2 min on one attempt
	8_000 + estChunks * 12_000,
	);
	console.debug("[useStreamingVoice] POST /api/transcribe", {
	bytes: blob.size,
	mime: blob.type,
	lang: language,
	estChunks,
	attemptTimeoutMs,
	});
	const sarvam = await retryPostTranscribe(async (signal) => {
	// Race per-attempt timeout against the retry signal so a
	// hung connection still surfaces as an attempt failure (and
	// triggers the next backoff step) rather than blocking
	// forever. signal aborts when the OUTER retry loop is killed.
	const timeoutCtl = new AbortController();
	const timer = setTimeout(() => timeoutCtl.abort(), attemptTimeoutMs);
	const onOuterAbort = () => timeoutCtl.abort();
	signal.addEventListener("abort", onOuterAbort);
	try {
	return await postTranscribe(blob, language, timeoutCtl.signal);
	} finally {
	clearTimeout(timer);
	signal.removeEventListener("abort", onOuterAbort);
	}
	});
	if (sarvam) {
	const sarvamText = (sarvam.text \|\| "").trim();
	if (sarvamText) {
	authoritativeText = sarvamText;
	console.debug("[useStreamingVoice] Sarvam OK", {
	latency_ms: sarvam.latency_ms,
	webSpeechLen: accumulatedText.length,
	sarvamLen: sarvamText.length,
	});
	} else {
	console.debug("[useStreamingVoice] Sarvam returned empty; using Web Speech fallback");
	}
	} else {
	console.debug("[useStreamingVoice] Sarvam failed after retries; using Web Speech fallback");
	try { onVoiceErrorRef.current("transcribe_failed"); } catch { /* ignore */ }
	}
	}

	// KI-210 — final wait-and-retry after Sarvam round-trip. Don't
	// drop the now-authoritative transcript if text raced us during
	// the network call.
	if (authoritativeText) {
	await waitForTextClear();
	onFinalRef.current(authoritativeText);
	}
	})();
	};

	// (Re)start the grace-window timer. Every onend resets it, so as long
	// as the user keeps starting new word bursts within 1.5s of the last
	// silence, the timer never fires and the utterance keeps growing.
	if (pendingSubmitTimerRef.current !== null) {
	clearTimeout(pendingSubmitTimerRef.current);
	}
	submitStartTsRef.ts = 0;
	pendingSubmitTimerRef.current = setTimeout(() => {
	void submitPendingUtterance();
	}, UTTERANCE_GRACE_MS);
	};

	return rec;
	}, [language, isTextRequestPendingRef, clearRestartTimer, safeStart, stopRecorder, teardownAudio, ensureAudioCapture, waitForTextClear]);

	/**
	 * Begin live listening.
	 *
	 * Marks the session as wanted, lazily builds the SpeechRecognition
	 * instance, resets the collected finals, then waits for the microphone
	 * capture attempt before starting recognition. Reports an error through
	 * `onErrorRef` and does nothing else when the browser is unsupported.
	 */
	const start = useCallback(() => {
	  if (!isSupported) {
	    onErrorRef.current(
	      "Live voice not supported in this browser. Use push-to-talk or type instead.",
	    );
	    return;
	  }
	  wantRunningRef.current = true;
	  if (!recognitionRef.current) {
	    recognitionRef.current = buildRecognition();
	  }
	  finalsRef.current = [];
	  finalsConsumedRef.current = 0;
	  // W1 (2026-05-15) — gate the SR start on the `getUserMedia` attempt
	  // instead of racing the two in parallel, so a permission denial is
	  // surfaced BEFORE recognition kicks off (otherwise the pill flips to
	  // "Voice on" over a dead mic).
	  void (async () => {
	    const ok = await ensureAudioCapture();
	    // The session may have been cancelled while we were awaiting:
	    //  - hard denial: the capture attempt itself reset wantRunningRef
	    //    (the previous `!ok && !wantRunningRef.current` case), or
	    //  - stop() ran during the await (voice toggled off / unmount while
	    //    the permission prompt or device open was pending).
	    // In both cases starting recognition would revive a torn-down
	    // session, so bail regardless of whether capture succeeded.
	    if (!wantRunningRef.current) {
	      console.debug("[useStreamingVoice] start aborted — session stopped during capture", {
	        captureOk: Boolean(ok),
	      });
	      return;
	    }
	    // Either capture is live, or it soft-failed while voice is still
	    // wanted (e.g. MediaRecorder mime mismatch on a niche browser): fall
	    // back to Web-Speech-only and let onend's restart loop take over.
	    safeStart();
	  })();
	}, [isSupported, buildRecognition, safeStart, ensureAudioCapture]);

	/**
	 * Stop live listening and reset all per-utterance state.
	 *
	 * Order matters: recognition is aborted and unbound first, audio capture
	 * is torn down, any utterance still waiting in the grace window is
	 * delivered (unless a text turn is in flight), and only then is the
	 * pending state cleared and the listener told we are no longer listening.
	 */
	const stop = useCallback(() => {
	  wantRunningRef.current = false;
	  clearRestartTimer();

	  const recognition = recognitionRef.current;
	  if (recognition) {
	    try {
	      recognition.abort();
	    } catch {
	      // abort() on an already-idle instance may throw; nothing to do
	    }
	    // FIX 1 (HIGH) — detach every handler. Chrome can still deliver
	    // onresult/onend a few hundred ms after abort(); with the handlers
	    // gone those late events cannot touch finalsRef /
	    // pendingUtteranceRef / pendingChunksRef or re-arm the grace timer.
	    try {
	      recognition.onresult = null;
	      recognition.onerror = null;
	      recognition.onend = null;
	      recognition.onstart = null;
	    } catch {
	      // some browsers reject null assignment on EventTarget props
	    }
	  }
	  recognitionRef.current = null;

	  teardownAudio();
	  finalsRef.current = [];
	  finalsConsumedRef.current = 0;

	  // FIX 6 (HIGH) — toggle-off inside the grace window: hand the finished
	  // sentence to the caller rather than dropping it. Skipped while a text
	  // request is pending, where dropping beats colliding with that turn.
	  const leftoverText = pendingUtteranceRef.current.trim();
	  const shouldFlush = leftoverText.length > 0 && !isTextRequestPendingRef.current;
	  if (shouldFlush) {
	    console.debug("[useStreamingVoice] KI-222 flushing pending on stop", { len: leftoverText.length });
	    try {
	      onFinalRef.current(leftoverText);
	    } catch {
	      // a throwing callback must not interrupt the rest of stop()
	    }
	  }

	  // KI-202 — cancel the grace timer and wipe the pending utterance so a
	  // stale half-sentence is not auto-submitted next time voice comes on.
	  const graceTimer = pendingSubmitTimerRef.current;
	  if (graceTimer !== null) {
	    clearTimeout(graceTimer);
	    pendingSubmitTimerRef.current = null;
	  }
	  pendingUtteranceRef.current = "";
	  pendingChunksRef.current = [];

	  onListeningRef.current(false);
	}, [clearRestartTimer, teardownAudio, isTextRequestPendingRef]);

	// Drive start/stop from the `enabled` prop so the hook is fire-and-forget
	// for the caller (mirrors useLiveConversation's `live` state semantics).
	useEffect(() => {
	  // Mirror the `enabled` prop: start on true, stop on false. The cleanup
	  // always stops, covering both the next toggle and unmount.
	  const applyEnabledState = enabled ? start : stop;
	  applyEnabledState();
	  return () => stop();
	  // eslint-disable-next-line react-hooks/exhaustive-deps
	}, [enabled]);

	// #53 / #54 — warm-stream lifecycle. The warm stream's lifecycle is
	// DELIBERATELY decoupled from `enabled` (which page.tsx flips OFF for pure
	// push-to-talk — see page.tsx:986 `live.setLive(false)` inside
	// startRecording). The user opting into voice latches the warm stream ON
	// for the rest of the hook's mounted lifetime so:
	// (a) the pre-roll ring is ALWAYS filling whenever the user might press
	// SPACE — including in pure-PTT mode when `enabled` is false — so the
	// first word spoken in page.tsx's cold-start gap survives (#53);
	// (b) a persistent open audio device keeps the OS mic hot so page.tsx's
	// own per-press getUserMedia resolves in ~10-50ms instead of
	// cold-starting (200-700ms), removing the felt start delay (#54).
	// Armed on the rising edge of `enabled` (the only voice-opt-in signal the
	// hook receives) and kept armed thereafter; fully released on unmount.
	useEffect(() => {
	  if (!isSupported) return;
	  // Latch: the first time `enabled` is seen true, remember that voice was
	  // opted into. The latch is never reset here.
	  if (enabled) {
	    voiceEverEnabledRef.current = true;
	  }
	  // Once latched (now or on an earlier run), keep the warm stream armed.
	  if (!voiceEverEnabledRef.current) return;
	  void armWarmStream();
	  // Intentionally no cleanup: the warm stream has to outlive `enabled`
	  // going false (Live↔PTT toggle); it is released on unmount elsewhere.
	}, [enabled, isSupported, armWarmStream]);

	// #53 / #54 — warm-stream health watchdog. The OS can silently drop a
	// long-lived capture (device sleep, USB mic unplug, OS audio interruption,
	// tab backgrounding on some browsers) WITHOUT firing recorder.onerror. If
	// that happens the pre-roll ring goes stale and the very bug we fixed
	// returns. Every 4s, while voice has been opted into and we're not in the
	// middle of a PTT capture, re-assert the warm stream (armWarmStream is a
	// no-op when the recorder is healthily "recording").
	useEffect(() => {
	  if (!isSupported) return;
	  // Periodic health check: re-arm the warm stream when its recorder is
	  // missing, not in the "recording" state, or the stream ref is empty.
	  const reassertWarmStream = () => {
	    if (!voiceEverEnabledRef.current) return;
	    // A PTT capture is in flight — leave the device alone.
	    if (pttEngagedRef.current) return;
	    const recorder = warmRecorderRef.current;
	    const healthy =
	      recorder?.state === "recording" && Boolean(warmStreamRef.current);
	    if (!healthy) {
	      void armWarmStream();
	    }
	  };
	  const intervalId = setInterval(reassertWarmStream, 4000);
	  return () => clearInterval(intervalId);
	}, [isSupported, armWarmStream]);

	// KI-173 (2026-05-15) — heartbeat watchdog. Browser SpeechRecognition
	// occasionally enters a stopped state without `onend` firing (certain
	// network errors, transient OS audio interruptions, tab visibility
	// edge cases). The auto-restart in `onend` never gets the chance to
	// run, and the mic stays silently dead until the user toggles voice
	// off+on. Every 4s, if we WANT to be listening (enabled + wantRunningRef)
	// and no text turn is racing and no restart is already scheduled, call
	// `safeStart()` unconditionally — InvalidStateError is swallowed if
	// recognition is already running, otherwise this revives the dead state.
	useEffect(() => {
	  if (!(enabled && isSupported)) return;
	  // Heartbeat: every 4s try to (re)start recognition, but only when we
	  // want to be listening and nothing else owns the mic/turn right now.
	  const reviveIfIdle = () => {
	    if (!wantRunningRef.current) return;
	    if (isTextRequestPendingRef.current) return;
	    if (isTtsPlayingRef.current) return; // KI-188 — block revival during TTS playback
	    if (restartTimerRef.current !== null) return; // a restart is already scheduled
	    safeStart();
	  };
	  const intervalId = setInterval(reviveIfIdle, 4000);
	  return () => clearInterval(intervalId);
	}, [enabled, isSupported, isTextRequestPendingRef, safeStart]);

	// KI-188 (2026-05-15) — TTS playback gate. Browser Web Speech API has
	// its own internal mic pipeline that bypasses our getUserMedia AEC
	// constraints (KI-185), so SpeechRecognition transcribes the bot's TTS
	// audio bleeding from speakers as if it were user input. The visible
	// echo "perfect days to get started Rohit" was echo of bot's TTS
	// "perfect age to get started, Rohit". The only reliable JS-level fix
	// is to ABORT recognition while ANY <audio> element in the DOM is
	// playing, then revive via the heartbeat (KI-173) the moment all
	// audio ends.
	//
	// Trade-off: live "barge-in by just speaking" is disabled DURING TTS.
	// Push-to-talk still works (it uses MediaRecorder, not SpeechRecognition).
	useEffect(() => {
	if (!enabled \|\| !isSupported) return;
	if (typeof document === "undefined") return;

	// KI-189 (2026-05-15) — barge-in VAD state. The AnalyserNode + AudioContext
	// are lazily created on first TTS-playback and reused for subsequent
	// playbacks to avoid repeated AudioContext spin-up cost (Chrome warns
	// when >6 contexts coexist).
	// Shared context: used for the mic analyser (startBargeInLoop) and for
	// the per-element bot analysers (ensureAudioCtx); closed in teardownAnalyser.
	let audioCtx: AudioContext \| null = null;
	// Mic-side analyser fed by `sourceNode`; rebuilt when the stream changes.
	let analyser: AnalyserNode \| null = null;
	let sourceNode: MediaStreamAudioSourceNode \| null = null;
	// The MediaStream `analyser` is currently wired to; startBargeInLoop
	// compares it against the live stream to decide whether to rebuild.
	let attachedStream: MediaStream \| null = null;
	// Scratch buffer (analyser.fftSize samples) for getFloatTimeDomainData.
	let rmsBuf: Float32Array<ArrayBuffer> \| null = null;
	// Consecutive bargeInTick frames that passed both the RMS and ZCR gates;
	// reset to 0 on any failing frame and whenever the loop stops/starts.
	let sustainedFrames = 0;
	// requestAnimationFrame handle of the scheduled bargeInTick (null = idle).
	let rafId: number \| null = null;

	// KI-190 — per-<audio> bot-RMS analysers for adaptive threshold.
	// Each watched audio element gets its own MediaElementAudioSourceNode +
	// AnalyserNode so we can read the bot's instantaneous playback level
	// during a barge-in tick. Map keyed by the audio element.
	const botAnalysers = new Map<HTMLAudioElement, {
	source: MediaElementAudioSourceNode;
	analyser: AnalyserNode;
	buf: Float32Array<ArrayBuffer>;
	}>();
	// Track which <audio> elements we've dimmed so we can restore on cleanup.
	const duckedAudios = new Set<HTMLAudioElement>();
	// KI-195 — user-speech RMS tracker + per-element calibrated volume.
	// userSpeechRms is the rolling peak of mic RMS observed while the user
	// is actively speaking (recorder active, not TTS). It seeds the bot
	// volume target. Calibrated volumes per element survive across turns
	// so we don't have to re-learn after every reply.
	let userSpeechRms = USER_SPEECH_RMS_INITIAL;
	const calibratedVolumes = new Map<HTMLAudioElement, number>();
	// requestAnimationFrame handle of the scheduled userRmsTick (null = idle).
	let userRmsRafId: number \| null = null;
	// setInterval handle driving calibrateBotVolume (null = not calibrating).
	let volumeCalibIntervalId: ReturnType<typeof setInterval> \| null = null;
	// FIX 5 (HIGH) — wall-clock decay interval. The rAF-driven userRmsTick
	// is gated on `!isTtsPlaying`, so during bot TTS playback there is NO
	// decay of userSpeechRms — a shout right before the bot starts speaking
	// would pin userSpeechRms at 0.4 for the entire bot turn. This setInterval
	// runs unconditionally while `enabled` is true, so the rolling peak
	// decays toward USER_SPEECH_RMS_INITIAL on a wall-clock schedule that's
	// independent of the rAF gate.
	let userRmsWallClockIntervalId: ReturnType<typeof setInterval> \| null = null;

	/**
	 * Read one frame of time-domain data from the mic analyser and return
	 * its RMS. Returns 0 when the analyser/buffer is not set up or the read
	 * throws.
	 */
	const sampleUserRms = (): number => {
	  const node = analyser;
	  const samples = rmsBuf;
	  if (!node || !samples) return 0;
	  try {
	    node.getFloatTimeDomainData(samples);
	  } catch {
	    return 0;
	  }
	  let energy = 0;
	  for (const sample of samples) {
	    energy += sample * sample;
	  }
	  return Math.sqrt(energy / samples.length);
	};

	/**
	 * rAF tick that learns the user's speech level. Stops itself (clears
	 * userRmsRafId without rescheduling) when voice is not wanted, TTS is
	 * playing, the recorder is inactive, or the analyser is not ready.
	 */
	const userRmsTick = () => {
	  const cannotLearn =
	    !wantRunningRef.current
	    || isTtsPlayingRef.current
	    || !recorderActiveRef.current
	    || !analyser
	    || !rmsBuf;
	  if (cannotLearn) {
	    userRmsRafId = null;
	    return;
	  }
	  const level = sampleUserRms();
	  if (level > USER_SPEECH_DETECTION_THRESHOLD) {
	    // Rolling peak: keep the larger of this frame and the previous peak
	    // decayed by 5%, then cap at the ceiling (FIX 5) so one shout cannot
	    // pin the estimate high.
	    userSpeechRms = Math.min(
	      USER_SPEECH_RMS_CEILING,
	      Math.max(level, userSpeechRms * 0.95),
	    );
	  }
	  userRmsRafId = requestAnimationFrame(userRmsTick);
	};

	/** Schedule userRmsTick unless a tick is already scheduled. */
	const startUserRmsLoop = () => {
	  // The tick reuses the VAD analyser built by startBargeInLoop; when that
	  // does not exist yet the first tick simply exits and a later state
	  // transition starts the loop again.
	  if (userRmsRafId === null) {
	    userRmsRafId = requestAnimationFrame(userRmsTick);
	  }
	};

	/** Cancel the scheduled userRmsTick, if any. */
	const stopUserRmsLoop = () => {
	  if (userRmsRafId === null) return;
	  cancelAnimationFrame(userRmsRafId);
	  userRmsRafId = null;
	};

	// FIX 5 (HIGH) — wall-clock decay. Runs every USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS
	// regardless of TTS state so the rolling peak can't get permanently
	// pinned high during long TTS turns. Floors at USER_SPEECH_RMS_INITIAL
	// so we don't decay below the calibrated baseline.
	/**
	 * Start the interval that decays userSpeechRms on a wall-clock schedule.
	 * No-op when the interval is already running.
	 */
	const startUserRmsWallClockDecay = () => {
	  if (userRmsWallClockIntervalId !== null) return;
	  const decayOnce = () => {
	    const decayed = userSpeechRms * USER_SPEECH_RMS_WALL_CLOCK_DECAY_FACTOR;
	    // Never decay below the initial baseline.
	    userSpeechRms = Math.max(USER_SPEECH_RMS_INITIAL, decayed);
	  };
	  userRmsWallClockIntervalId = setInterval(
	    decayOnce,
	    USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS,
	  );
	};

	/** Stop the wall-clock decay interval, if running. */
	const stopUserRmsWallClockDecay = () => {
	  if (userRmsWallClockIntervalId === null) return;
	  clearInterval(userRmsWallClockIntervalId);
	  userRmsWallClockIntervalId = null;
	};

	// KI-195 — volume calibration tick. Runs during TTS. Samples the bot's
	// playback RMS at the <audio> element output via botAnalysers (not at
	// the mic). If bot is louder than target relative to userSpeechRms,
	// duck el.volume by 20% per tick down to the floor.
	/**
	 * One calibration tick. Cancels its own interval once TTS is no longer
	 * playing; otherwise, when the bot level exceeds the target derived from
	 * userSpeechRms, lowers the volume of every actively playing TTS element
	 * by VOLUME_CALIB_DUCK_FACTOR (never below VOLUME_CALIB_FLOOR) and
	 * records the new value in calibratedVolumes.
	 */
	const calibrateBotVolume = () => {
	  if (!isTtsPlayingRef.current) {
	    if (volumeCalibIntervalId !== null) {
	      clearInterval(volumeCalibIntervalId);
	      volumeCalibIntervalId = null;
	    }
	    return;
	  }
	  const targetLevel = userSpeechRms * VOLUME_CALIB_TARGET_RATIO;
	  const botLevel = computeBotRms();
	  if (!(botLevel > targetLevel)) return;
	  ttsAudioElementsRef.current.forEach((audioEl) => {
	    const isActive = !audioEl.paused && !audioEl.ended;
	    if (!isActive) return;
	    const before = audioEl.volume;
	    const after = Math.max(VOLUME_CALIB_FLOOR, before * VOLUME_CALIB_DUCK_FACTOR);
	    // Skip no-op writes once the floor has effectively been reached.
	    if (!(after < before - 0.001)) return;
	    try {
	      audioEl.volume = after;
	      calibratedVolumes.set(audioEl, after);
	    } catch { /* ignore */ }
	  });
	};

	/** Start the calibration interval unless it is already running. */
	const startVolumeCalibration = () => {
	  if (volumeCalibIntervalId === null) {
	    volumeCalibIntervalId = setInterval(calibrateBotVolume, VOLUME_CALIB_TICK_MS);
	  }
	};

	/** Stop the calibration interval, if running. */
	const stopVolumeCalibration = () => {
	  if (volumeCalibIntervalId === null) return;
	  clearInterval(volumeCalibIntervalId);
	  volumeCalibIntervalId = null;
	};

	/** Cancel the scheduled bargeInTick (if any) and reset the frame counter. */
	const stopBargeInLoop = () => {
	  const scheduled = rafId;
	  rafId = null;
	  if (scheduled !== null) {
	    cancelAnimationFrame(scheduled);
	  }
	  sustainedFrames = 0;
	};

	/**
	 * Release every Web Audio resource owned by this effect: stop the
	 * barge-in loop, disconnect the mic source/analyser, disconnect and
	 * forget all per-element bot analysers, and close the shared
	 * AudioContext. Every step is best-effort and never throws.
	 */
	const teardownAnalyser = () => {
	  stopBargeInLoop();
	  try { sourceNode?.disconnect(); } catch { /* ignore */ }
	  try { analyser?.disconnect(); } catch { /* ignore */ }
	  sourceNode = null;
	  analyser = null;
	  attachedStream = null;
	  rmsBuf = null;
	  // KI-190 — tear down bot analysers + audio context.
	  botAnalysers.forEach((entry) => {
	    try { entry.source.disconnect(); } catch { /* ignore */ }
	    try { entry.analyser.disconnect(); } catch { /* ignore */ }
	  });
	  botAnalysers.clear();
	  if (audioCtx) {
	    const ctx = audioCtx;
	    // Drop the reference first so nothing reuses a closing context.
	    audioCtx = null;
	    try {
	      // close() returns a promise: a rejection would bypass the
	      // surrounding try/catch and surface as an unhandled rejection, so
	      // swallow it on the promise itself. The outer try still covers a
	      // synchronous throw.
	      void ctx.close().catch(() => { /* ignore */ });
	    } catch { /* ignore */ }
	  }
	};

	// KI-190 — ensure an AudioContext exists for bot analyser attachment.
	// Reuses the same instance the VAD path uses.
	/**
	 * Return the shared AudioContext, creating it when there is none or the
	 * current one is closed. Returns null when no constructor is available
	 * or construction throws.
	 */
	const ensureAudioCtx = (): AudioContext | null => {
	  const existing = audioCtx;
	  if (existing && existing.state !== "closed") return existing;
	  try {
	    const Ctor = (window.AudioContext
	      || (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext);
	    if (Ctor) {
	      const created = new Ctor();
	      audioCtx = created;
	      return created;
	    }
	    return null;
	  } catch {
	    return null;
	  }
	};

	// KI-190 — attach an AnalyserNode to a bot <audio> element. Routes the
	// element's audio through the AudioContext (source → analyser →
	// destination so it stays audible). createMediaElementSource throws if
	// called twice on the same element, so we swallow and skip.
	/**
	 * Route a bot <audio> element through the shared AudioContext
	 * (source → analyser → destination) and register the analyser in
	 * botAnalysers so computeBotRms can read the element's playback level.
	 *
	 * No-op when the element is already registered or no AudioContext can be
	 * obtained. If routing throws (e.g. the element was already captured by
	 * another createMediaElementSource call) the element is left untouched
	 * and bargeInTick falls back to its base threshold.
	 *
	 * @param el the TTS audio element to instrument
	 */
	const attachBotAnalyser = (el: HTMLAudioElement) => {
	  if (botAnalysers.has(el)) return;
	  const ctx = ensureAudioCtx();
	  if (!ctx) return;
	  try {
	    const source = ctx.createMediaElementSource(el);
	    const an = ctx.createAnalyser();
	    an.fftSize = 1024;
	    an.smoothingTimeConstant = 0.4;
	    source.connect(an);
	    an.connect(ctx.destination);
	    const buf = new Float32Array(new ArrayBuffer(an.fftSize * 4));
	    botAnalysers.set(el, { source, analyser: an, buf });
	  } catch {
	    // already routed through Web Audio elsewhere, or autoplay policy
	    // blocked the context — bargeInTick will simply use the base
	    // threshold for this turn.
	    return;
	  }
	  // From here on the element is audible ONLY through the context's graph.
	  // A context left suspended by the autoplay policy would therefore mute
	  // the bot entirely, and ensureAudioCtx never resumes it — so make a
	  // best-effort resume now and, as startBargeInLoop does, report a
	  // structured error so the UI can ask the user for a gesture.
	  if (ctx.state === "suspended") {
	    void ctx.resume().catch((err) => {
	      console.debug("[useStreamingVoice] KI-190 AudioContext.resume failed after routing TTS element", err);
	      try { onVoiceErrorRef.current("audio_context_suspended"); } catch { /* ignore */ }
	    });
	  }
	};

	// KI-190 — current peak bot RMS across all playing <audio> elements.
	// We take the max (not sum) because only one TTS plays at a time in
	// practice and max behaves more sensibly if a stale paused element is
	// still in the map.
	/**
	 * Highest RMS among the bot <audio> elements that are currently playing
	 * (paused/ended elements are skipped). Returns 0 when none qualifies.
	 */
	const computeBotRms = (): number => {
	  let loudest = 0;
	  for (const [audioEl, entry] of botAnalysers) {
	    if (audioEl.paused || audioEl.ended) continue; // idle element
	    entry.analyser.getFloatTimeDomainData(entry.buf);
	    let energy = 0;
	    for (const sample of entry.buf) {
	      energy += sample * sample;
	    }
	    // The element source is tapped post-volume, so this is the level
	    // after any ducking has been applied.
	    const level = Math.sqrt(energy / entry.buf.length);
	    if (level > loudest) loudest = level;
	  }
	  return loudest;
	};

	/**
	 * Act on a detected barge-in: deliver any utterance still held in the
	 * grace window, raise the barge-in flag for the caller, silence every
	 * TTS element and stop the detection loop.
	 *
	 * @param rms mic RMS of the frame that completed the detection (logged)
	 */
	const triggerBargeIn = (rms: number) => {
	  console.debug("[useStreamingVoice] KI-189 barge-in detected", {
	    rms: rms.toFixed(4),
	    frames: sustainedFrames,
	    threshold: BARGE_IN_RMS_THRESHOLD,
	  });

	  // KI-227 (V6.7) — text that accumulated during the bot's TTS window
	  // would otherwise wait for the grace timer. Hand it over now, clearing
	  // the pending state BEFORE invoking the callback. Skipped while a text
	  // request is pending. A throw here must not stop the barge-in itself.
	  try {
	    const heldText = pendingUtteranceRef.current.trim();
	    const mayFlush = heldText.length > 0 && !isTextRequestPendingRef.current;
	    if (mayFlush) {
	      console.debug("[useStreamingVoice] V6.7 flushing pending utterance on barge-in", {
	        len: heldText.length,
	      });
	      pendingUtteranceRef.current = "";
	      pendingChunksRef.current = [];
	      finalsRef.current = [];
	      finalsConsumedRef.current = 0;
	      const graceTimer = pendingSubmitTimerRef.current;
	      if (graceTimer !== null) {
	        clearTimeout(graceTimer);
	        pendingSubmitTimerRef.current = null;
	      }
	      onFinalRef.current(heldText);
	    }
	  } catch (err) {
	    console.debug("[useStreamingVoice] V6.7 pending flush threw", err);
	  }

	  // FIX 3 (HIGH) — signal the caller so it can abort the in-flight
	  // request; pausing the mounted elements alone would not stop TTS audio
	  // that arrives afterwards.
	  bargeInRequestedRef.current = true;

	  // Pause and rewind every TTS element; their pause listeners update the
	  // TTS-playing state.
	  ttsAudioElementsRef.current.forEach((audioEl) => {
	    try {
	      audioEl.pause();
	      audioEl.currentTime = 0;
	    } catch {
	      // best-effort per element
	    }
	  });
	  stopBargeInLoop();
	};

	// One requestAnimationFrame tick of the barge-in detector.
	//
	// Per frame: (1) bail out and stop the loop if TTS is no longer playing,
	// voice is not wanted, a text request is pending, or the analyser is
	// missing; (2) read the mic frame and compute its RMS and zero-crossing
	// count; (3) feed the RMS to the noise-floor estimator and derive the
	// adaptive threshold; (4) during the post-start grace window only
	// reschedule; (5) otherwise count consecutive qualifying frames and call
	// triggerBargeIn once BARGE_IN_SUSTAINED_FRAMES is reached. Every path
	// that does not stop or trigger reschedules itself via rafId.
	const bargeInTick = () => {
	// Re-check gating each frame — if state changed mid-loop, exit cleanly.
	if (
	!isTtsPlayingRef.current
	\|\| !wantRunningRef.current
	\|\| isTextRequestPendingRef.current
	) {
	stopBargeInLoop();
	return;
	}
	if (!analyser \|\| !rmsBuf) {
	stopBargeInLoop();
	return;
	}
	analyser.getFloatTimeDomainData(rmsBuf);
	let sumSq = 0;
	// FIX 4 (HIGH) — compute zero-crossing rate alongside RMS. Speech
	// ZCR sits in a specific band; keyboard typing has very high ZCR
	// (transients), HVAC / room rumble has very low ZCR (DC-like).
	// Rejecting frames outside the speech band cuts false-positive
	// barge-ins from typing and ambient noise.
	let zeroCrossings = 0;
	// Sign of the first sample seeds the comparison; crossings are counted
	// from the second sample onward (the `i > 0` guard below).
	let prevSign = rmsBuf[0] >= 0 ? 1 : -1;
	for (let i = 0; i < rmsBuf.length; i++) {
	const v = rmsBuf[i];
	sumSq += v * v;
	if (i > 0) {
	const sign = v >= 0 ? 1 : -1;
	if (sign !== prevSign) zeroCrossings += 1;
	prevSign = sign;
	}
	}
	const rms = Math.sqrt(sumSq / rmsBuf.length);
	// KI-228 (2026-05-15) — V6.8. Feed every frame into the adaptive
	// noise-floor estimator. It only updates the EMA when the frame is
	// below the CURRENT threshold (i.e. the frame looks like silence),
	// so speech bursts can't pollute the room baseline.
	noiseFloorRef.current.feed(rms);
	const noiseAdaptiveThreshold = noiseFloorRef.current.currentThreshold();
	// KI-190 — adaptive threshold: bot_rms * 2 + 0.005, floored at the
	// base BARGE_IN_RMS_THRESHOLD so we never set it absurdly low.
	// KI-228 (2026-05-15) — V6.8. ALSO floor at the noise-floor adaptive
	// threshold so a noisy room (HVAC, café) doesn't cause false-positive
	// barge-ins on the original static 0.008 threshold.
	const botRms = computeBotRms();
	// KI-285 (2026-05-16) — defence-in-depth for the post-grace window.
	// computeBotRms() returns 0 not only for the first frames of playback
	// but PERMANENTLY whenever createMediaElementSource() threw (Safari,
	// element already Web-Audio-routed, autoplay-suspended ctx). In that
	// state the `botRms * MULT + BASE` term collapses to 0.002 and the
	// whole Math.max() falls back to the bare 0.008 static floor — which
	// is BELOW documented speaker echo bleed (~0.02 RMS, KI-189/190). The
	// bot's own voice then clears the gate and self-triggers a barge-in.
	// When we have no usable bot-level reference, hold the threshold at an
	// echo-safe floor: above worst-case AEC residual, well below the
	// 0.05-0.2 RMS of real user speech, so genuine barge-in still fires.
	const haveBotRef = botRms > 0;
	// Effective threshold = the largest of the three candidate floors.
	const adaptiveThreshold = Math.max(
	haveBotRef ? BARGE_IN_RMS_THRESHOLD : BARGE_IN_NO_BOTREF_FLOOR,
	noiseAdaptiveThreshold,
	botRms * BARGE_IN_BOT_RMS_MULTIPLIER + BARGE_IN_BASE_THRESHOLD,
	);
	// FIX 4 / KI-225 (V1.3) — speech ZCR band scaled to the actual
	// AudioContext sampleRate. At 48 kHz that's the original 20..250;
	// at 16 kHz it's ~7..83.
	const band = zcrBandRef.current;
	const isSpeechBand = zeroCrossings >= band.min && zeroCrossings <= band.max;

	// KI-285 (2026-05-16) — echo-suppression grace window. For the first
	// BARGE_IN_GRACE_MS of the bot's reply, the energy at the mic is the
	// bot's OWN audio echoing back (browser AEC is imperfect on speaker
	// users), NOT the user. Refuse to accumulate sustained frames or
	// trigger during this window, but KEEP the rAF loop alive so the
	// instant the window elapses — if the user is genuinely speaking over
	// the bot — the sustained-energy gate re-arms and fires within
	// BARGE_IN_SUSTAINED_FRAMES (~100ms). Hold sustainedFrames at 0 so an
	// echo burst that straddles the grace boundary cannot carry partial
	// credit past it. Real barge-in is the user talking for seconds, so
	// it always survives a 600ms suppression; the bot's start-of-reply
	// echo, which cannot outlast the window without the user speaking, is
	// the only thing suppressed. `started === 0` means no active playback
	// stamp (defensive): treat as still-in-grace so we never trigger on a
	// stale/unknown timeline.
	const started = ttsPlaybackStartedAtRef.current;
	const inGraceWindow =
	started === 0 \|\| Date.now() - started < BARGE_IN_GRACE_MS;
	if (inGraceWindow) {
	sustainedFrames = 0;
	rafId = requestAnimationFrame(bargeInTick);
	return;
	}

	// A frame qualifies only when it is both loud enough and speech-like;
	// any non-qualifying frame resets the run.
	if (rms >= adaptiveThreshold && isSpeechBand) {
	sustainedFrames += 1;
	if (sustainedFrames >= BARGE_IN_SUSTAINED_FRAMES) {
	// triggerBargeIn stops the loop itself, so do not reschedule.
	triggerBargeIn(rms);
	return;
	}
	} else {
	sustainedFrames = 0;
	}
	rafId = requestAnimationFrame(bargeInTick);
	};

	// Start (or restart) the barge-in detection loop on the live mic stream.
	//
	// Does nothing unless voice is wanted, no text request is pending, the
	// recorder is active and mediaStreamRef holds a stream with at least one
	// audio track. Lazily creates the AudioContext, attempts to resume it if
	// suspended, (re)builds the mic analyser when none exists or the stream
	// changed, then schedules bargeInTick. Any throw during setup is logged
	// and followed by teardownAnalyser().
	const startBargeInLoop = () => {
	// Gating: voice mode active, no racing text turn, MediaRecorder live.
	if (!wantRunningRef.current) return;
	if (isTextRequestPendingRef.current) return;
	if (!recorderActiveRef.current) return;
	const stream = mediaStreamRef.current;
	if (!stream \|\| stream.getAudioTracks().length === 0) return;

	try {
	// Reuse the AudioContext + AnalyserNode if the same stream is still
	// attached; otherwise rebuild (the stream may have been swapped out
	// by teardownAudio() between TTS plays).
	if (!audioCtx \|\| audioCtx.state === "closed") {
	const Ctor = (window.AudioContext
	\|\| (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext);
	if (!Ctor) return;
	audioCtx = new Ctor();
	}
	if (audioCtx.state === "suspended") {
	// KI-223 (2026-05-15) — V1.1. Best-effort resume; if it rejects
	// (Chrome's autoplay policy requires a user gesture), surface a
	// structured error so the UI can prompt the user to tap. Without
	// this, the VAD silently never fires and barge-in appears broken
	// for the entire session.
	void audioCtx.resume().catch((err) => {
	console.debug("[useStreamingVoice] V1.1 AudioContext.resume failed", err);
	try { onVoiceErrorRef.current("audio_context_suspended"); } catch { /* ignore */ }
	});
	}
	if (!analyser \|\| attachedStream !== stream) {
	// Drop the previous mic graph before wiring the new stream.
	try { sourceNode?.disconnect(); } catch { /* ignore */ }
	try { analyser?.disconnect(); } catch { /* ignore */ }
	analyser = audioCtx.createAnalyser();
	analyser.fftSize = 2048;
	analyser.smoothingTimeConstant = 0.5;
	sourceNode = audioCtx.createMediaStreamSource(stream);
	// The analyser is only read from; it is not connected to the
	// context destination here.
	sourceNode.connect(analyser);
	attachedStream = stream;
	rmsBuf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4));
	// KI-225 (2026-05-15) — V1.3. Compare the AudioContext's actual
	// sampleRate against the track's reported rate. If they disagree,
	// log a warning AND rescale the speech ZCR band so the VAD math
	// keeps meaning at 16 kHz / 24 kHz consumer mics (the static
	// 20..250 band from KI-189 was calibrated for 48 kHz).
	try {
	const trackRate = stream.getAudioTracks()[0]?.getSettings?.().sampleRate;
	const ctxRate = audioCtx.sampleRate;
	if (trackRate && Math.abs(trackRate - ctxRate) > 100) {
	console.debug(
	"[useStreamingVoice] V1.3 sample-rate mismatch",
	{ trackRate, ctxRate },
	);
	}
	// The band is always derived from the context rate, whether or not
	// a mismatch was logged.
	zcrBandRef.current = scaleSpeechZcrBand(ctxRate);
	} catch {
	// Older browsers without MediaTrackSettings.sampleRate — keep
	// the reference band.
	zcrBandRef.current = scaleSpeechZcrBand(audioCtx.sampleRate);
	}
	}
	// Fresh run: reset the counter and replace any already-scheduled tick.
	sustainedFrames = 0;
	if (rafId !== null) cancelAnimationFrame(rafId);
	rafId = requestAnimationFrame(bargeInTick);
	} catch (err) {
	console.debug("[useStreamingVoice] KI-189 VAD init failed", err);
	teardownAnalyser();
	}
	};

	/**
	 * Recompute whether any watched TTS <audio> element is playing and react
	 * to the edges of that state.
	 *
	 * Rising edge (TTS started): stamp the playback start, begin dropping
	 * recognition results, abort recognition, stop learning the user level,
	 * re-apply the ducked volume, start volume calibration and arm the
	 * barge-in loop. Falling edge (TTS ended): clear the stamp, schedule the
	 * end of result dropping, stop barge-in and calibration, resume
	 * user-level learning and restart recognition when allowed. Calls with no
	 * edge only refresh isTtsPlayingRef.
	 */
	const updateTtsState = () => {
	  let anyPlaying = false;
	  ttsAudioElementsRef.current.forEach((el) => {
	    if (!el.paused && !el.ended) anyPlaying = true;
	  });
	  const wasPlaying = isTtsPlayingRef.current;
	  isTtsPlayingRef.current = anyPlaying;
	  if (anyPlaying && !wasPlaying) {
	    // TTS just started — abort any in-flight recognition so it stops
	    // transcribing the bot voice.
	    // KI-285 (2026-05-16) — stamp the playback-start instant so the
	    // barge-in tick can suppress detection during the BARGE_IN_GRACE_MS
	    // echo window. This is the only false→true edge.
	    ttsPlaybackStartedAtRef.current = Date.now();
	    console.debug("[useStreamingVoice] KI-188 TTS started — pausing recognition");
	    // KI-203 (2026-05-15) — flip the result-drop flag the INSTANT TTS
	    // starts. abort() below has a tail during which onresult can still
	    // fire with bot-voice transcripts; the flag closes that window.
	    if (dropResultsClearTimerRef.current !== null) {
	      clearTimeout(dropResultsClearTimerRef.current);
	      dropResultsClearTimerRef.current = null;
	    }
	    dropResultsRef.current = true;
	    console.debug("[useStreamingVoice] KI-203 dropResultsRef=true (TTS start)");
	    const rec = recognitionRef.current;
	    if (rec) {
	      try { rec.abort(); } catch { /* ignore */ }
	    }
	    // KI-195 — the user cannot be speaking during TTS playback; stop the
	    // RMS-learning loop so bot bleed-through is not learned as user level.
	    stopUserRmsLoop();
	    // KI-191 — re-duck every playing audio in case React or the audio
	    // element default reset the volume after watchAudio set it.
	    // KI-195 — re-duck to the SAME value watchAudio applies: the
	    // element's calibrated volume when one was learned, otherwise the
	    // default voice-mode level. Forcing VOICE_MODE_TTS_VOLUME here would
	    // overwrite a lower calibrated volume on every play edge and make
	    // calibration re-learn from scratch.
	    ttsAudioElementsRef.current.forEach((el) => {
	      if (el.paused) return;
	      const calibrated = calibratedVolumes.get(el);
	      const targetVolume = calibrated !== undefined ? calibrated : VOICE_MODE_TTS_VOLUME;
	      if (el.volume !== targetVolume) {
	        try { el.volume = targetVolume; } catch { /* ignore */ }
	      }
	    });
	    // KI-195 — with the starting volume in place, begin adaptive
	    // calibration so the bot's volume tracks the learned user level.
	    startVolumeCalibration();
	    // KI-192 (2026-05-15) — the recorder may have been torn down between
	    // utterances, in which case startBargeInLoop would bail on its
	    // recorderActiveRef gate. Re-establish capture first when allowed.
	    if (wantRunningRef.current && !isTextRequestPendingRef.current) {
	      void ensureAudioCapture().then(() => {
	        // TTS may have ended during the async capture round-trip.
	        if (isTtsPlayingRef.current) {
	          startBargeInLoop();
	        }
	      });
	    } else {
	      startBargeInLoop(); // best-effort if gates won't allow capture rebuild
	    }
	  } else if (!anyPlaying && wasPlaying) {
	    // TTS just ended — revive immediately rather than waiting for the
	    // heartbeat.
	    // KI-285 (2026-05-16) — clear the playback-start stamp so a stale
	    // value cannot satisfy the grace check on the next turn.
	    ttsPlaybackStartedAtRef.current = 0;
	    console.debug("[useStreamingVoice] KI-188 TTS ended — resuming recognition");
	    // KI-203 (2026-05-15) — keep dropping recognition results for
	    // POST_TTS_DROP_MS after TTS ends so buffered events from the aborted
	    // recognition cannot leak the tail of the bot's speech into the input.
	    if (dropResultsClearTimerRef.current !== null) {
	      clearTimeout(dropResultsClearTimerRef.current);
	    }
	    dropResultsClearTimerRef.current = setTimeout(() => {
	      dropResultsRef.current = false;
	      dropResultsClearTimerRef.current = null;
	      console.debug("[useStreamingVoice] KI-203 dropResultsRef=false (post-TTS window over)");
	    }, POST_TTS_DROP_MS);
	    stopBargeInLoop();
	    // KI-195 — freeze the per-element calibrated volume and resume
	    // learning the user's speech RMS for the next turn.
	    stopVolumeCalibration();
	    startUserRmsLoop();
	    if (wantRunningRef.current && !isTextRequestPendingRef.current) {
	      safeStart();
	    }
	  }
	};

	/**
	 * Begin tracking one <audio> element as a TTS source: register it, apply
	 * the voice-mode volume, attach the bot-level analyser and subscribe to
	 * its playback events. Calling it again for an already-tracked element
	 * is a no-op.
	 */
	const watchAudio = (el: HTMLAudioElement) => {
	  const tracked = ttsAudioElementsRef.current;
	  if (tracked.has(el)) return;
	  tracked.add(el);

	  // KI-191 / KI-195 — duck the element to VOICE_MODE_TTS_VOLUME unless a
	  // calibrated volume was already remembered for this exact element, in
	  // which case that remembered level is restored instead.
	  try {
	    const rememberedVolume = calibratedVolumes.get(el);
	    if (rememberedVolume !== undefined) {
	      el.volume = rememberedVolume;
	    } else {
	      el.volume = VOICE_MODE_TTS_VOLUME;
	    }
	    duckedAudios.add(el);
	  } catch { /* readonly volume on some platforms — ignore */ }

	  // KI-190 — attach bot-level analyser for adaptive threshold.
	  attachBotAnalyser(el);

	  // Every playback transition re-evaluates the shared TTS state.
	  ["play", "playing", "pause", "ended"].forEach((eventName) => {
	    el.addEventListener(eventName, updateTtsState);
	  });

	  // Evaluate once right away so audio that is already playing is noticed.
	  updateTtsState();
	};

	/**
	 * Stop tracking an <audio> element: detach the playback listeners, drop
	 * it from the tracked set and re-evaluate the shared TTS state. Elements
	 * that were never tracked are ignored.
	 */
	const unwatchAudio = (el: HTMLAudioElement) => {
	  const tracked = ttsAudioElementsRef.current;
	  if (!tracked.has(el)) return;

	  ["play", "playing", "pause", "ended"].forEach((eventName) => {
	    el.removeEventListener(eventName, updateTtsState);
	  });
	  tracked.delete(el);

	  // The removed element may have been the only one playing.
	  updateTtsState();
	};

	// Pick up every <audio> element that already exists in the document.
	document.querySelectorAll("audio").forEach((existing) => {
	  watchAudio(existing as HTMLAudioElement);
	});

	// Applies `visit` to `node` itself when it is an <audio> element and to
	// every <audio> descendant it contains. Non-HTMLElement nodes are skipped.
	const visitAudioWithin = (node: Node, visit: (el: HTMLAudioElement) => void) => {
	  if (!(node instanceof HTMLElement)) return;
	  if (node.tagName === "AUDIO") visit(node as HTMLAudioElement);
	  node.querySelectorAll?.("audio").forEach((nested) => visit(nested as HTMLAudioElement));
	};

	// Keep the tracked set in sync with the DOM: watch <audio> elements as
	// they are inserted anywhere under <body>, unwatch them when removed.
	const observer = new MutationObserver((records) => {
	  records.forEach((record) => {
	    record.addedNodes.forEach((node) => visitAudioWithin(node, watchAudio));
	    record.removedNodes.forEach((node) => visitAudioWithin(node, unwatchAudio));
	  });
	});
	observer.observe(document.body, { childList: true, subtree: true });

	// KI-195 — kick off the user-RMS learning loop as soon as this effect
	// runs, so a baseline speech level is available by the time the first
	// TTS plays. Per the original author's note the loop is expected to
	// self-exit when its preconditions aren't met (no analyser / no stream /
	// in TTS), which is why it is started unconditionally here.
	startUserRmsLoop();
	// FIX 5 (HIGH) — also start the wall-clock decay so userSpeechRms never
	// gets permanently pinned high, even during TTS playback when the rAF
	// loop is gated off. Stopped again in this effect's cleanup.
	startUserRmsWallClockDecay();

	// Effect cleanup: undo everything this effect set up. The statement
	// order is deliberate — loops and calibration are stopped before the
	// state they read is cleared.
	return () => {
	  // KI-195 / FIX 5 — halt the RMS learning loop, its wall-clock decay
	  // interval and the volume calibration before touching ducked-audio
	  // state, so a calibration tick can't race a clear().
	  stopUserRmsLoop();
	  stopUserRmsWallClockDecay();
	  stopVolumeCalibration();
	  calibratedVolumes.clear();
	  observer.disconnect();

	  // KI-191 — put every ducked element back to full volume so audio
	  // played after voice mode is switched off isn't left quiet.
	  duckedAudios.forEach((ducked) => {
	    try { ducked.volume = 1.0; } catch { /* ignore */ }
	  });
	  duckedAudios.clear();

	  // Detach the playback listeners from every tracked element.
	  ttsAudioElementsRef.current.forEach((trackedEl) => {
	    ["play", "playing", "pause", "ended"].forEach((eventName) => {
	      trackedEl.removeEventListener(eventName, updateTtsState);
	    });
	  });
	  ttsAudioElementsRef.current.clear();
	  isTtsPlayingRef.current = false;

	  // KI-203 — cancel the post-TTS drop-results timer and reset the flag
	  // so a later re-enable of voice mode doesn't inherit stale state.
	  const pendingDropTimer = dropResultsClearTimerRef.current;
	  if (pendingDropTimer !== null) {
	    clearTimeout(pendingDropTimer);
	    dropResultsClearTimerRef.current = null;
	  }
	  dropResultsRef.current = false;

	  // KI-189 — release AnalyserNode + AudioContext on unmount / disable.
	  teardownAnalyser();
	};
	}, [enabled, isSupported, isTextRequestPendingRef, safeStart]);

	// KI-174 (2026-05-15) — immediate-revival on visibility/focus changes.
	// User reported: "sometimes when I go away from clicking the text box,
	// it seems to not input my voice anymore. I have to restart the whole
	// voice thing." Root cause: Chrome's SpeechRecognition auto-stops
	// when the tab loses visibility (tab switch, app switch, screenshot,
	// OS modal). The KI-173 heartbeat is throttled to ~1Hz when the tab
	// is hidden, so it takes several seconds to revive after returning.
	// Force-revival on:
	// - document `visibilitychange` → visible
	// - window `focus`
	// Both check wantRunningRef + isTextRequestPendingRef before firing.
	useEffect(() => {
	  // Only wire the listeners while voice mode is enabled and supported,
	  // and only in a browser environment.
	  if (!enabled || !isSupported) return;
	  if (typeof window === "undefined" || typeof document === "undefined") return;

	  // Restart recognition when every gate allows it. `trigger` is used
	  // only for the debug log, to tell the two entry points apart.
	  const tryRevive = (trigger: string) => {
	    if (
	      wantRunningRef.current
	      && !isTextRequestPendingRef.current
	      && !isTtsPlayingRef.current // KI-188 — block revival during TTS
	      && document.visibilityState === "visible"
	    ) {
	      console.debug("[useStreamingVoice] revival trigger=" + trigger);
	      safeStart();
	    }
	  };

	  // `visibilitychange` fires for both hidden and visible transitions;
	  // the visibilityState check inside tryRevive filters out "hidden".
	  const onVisible = () => tryRevive("visibilitychange");
	  const onFocus = () => tryRevive("window.focus");
	  document.addEventListener("visibilitychange", onVisible);
	  window.addEventListener("focus", onFocus);

	  // Remove both listeners when the effect re-runs or the hook unmounts.
	  return () => {
	    document.removeEventListener("visibilitychange", onVisible);
	    window.removeEventListener("focus", onFocus);
	  };
	}, [enabled, isSupported, isTextRequestPendingRef, safeStart]);

	// Unmount cleanup.
	useEffect(() => {
	  // The effect body does nothing on its own; it only hands React a
	  // cleanup that releases every resource the hook may still be holding.
	  // NOTE(review): React also runs this cleanup whenever one of the
	  // dependencies changes identity — confirm clearRestartTimer,
	  // teardownAudio and disarmWarmStream are stable callbacks.
	  const releaseAll = () => {
	    wantRunningRef.current = false;
	    clearRestartTimer();

	    // Abort any live recognition session and detach its handlers.
	    const activeRecognition = recognitionRef.current;
	    if (activeRecognition) {
	      try { activeRecognition.abort(); } catch {}
	      activeRecognition.onresult = null;
	      activeRecognition.onerror = null;
	      activeRecognition.onend = null;
	      activeRecognition.onstart = null;
	    }
	    recognitionRef.current = null;
	    teardownAudio();

	    // KI-202 — cancel the pending utterance grace timer and discard any
	    // buffered utterance text / chunks.
	    const graceTimer = pendingSubmitTimerRef.current;
	    if (graceTimer !== null) {
	      clearTimeout(graceTimer);
	      pendingSubmitTimerRef.current = null;
	    }
	    pendingUtteranceRef.current = "";
	    pendingChunksRef.current = [];

	    // #53 / #54 — release the warm stream + recorder + AudioContext so
	    // the OS mic indicator goes off when the app is torn down.
	    disarmWarmStream();
	  };
	  return releaseAll;
	}, [clearRestartTimer, teardownAudio, disarmWarmStream]);

	// FIX 3 (HIGH) — one-shot read-and-clear of the barge-in flag. Returns
	// true exactly once after triggerBargeIn fires; subsequent calls return
	// false until the next barge-in event.
	/**
	 * Read-and-clear accessor for the barge-in flag.
	 *
	 * @returns `true` if the flag was set since the last call (and clears
	 *   it), otherwise `false`.
	 */
	const consumeBargeInSignal = useCallback((): boolean => {
	  // Nothing pending — report "no barge-in" and leave the flag untouched.
	  if (!bargeInRequestedRef.current) return false;
	  // Clear before returning so the signal is delivered exactly once.
	  bargeInRequestedRef.current = false;
	  return true;
	}, []);

	// Public surface of the hook, returned to the calling component.
	return {
	// Recognition controls and the support flag (the same flag gates the
	// effects in this hook).
	start,
	stop,
	isSupported,
	// One-shot read-and-clear accessor for the barge-in flag.
	consumeBargeInSignal,
	// #53 / #54 — warm-stream + pre-roll push-to-talk API.
	isWarm,
	armWarmStream,
	disarmWarmStream,
	beginPushToTalk,
	endPushToTalk,
	consumePreRollChunks,
	};
	}