Spaces:
Sleeping
Sleeping
| "use client"; | |
/**
 * useStreamingVoice — KI-168 (2026-05-15).
 *
 * Replaces the custom AudioWorklet + VAD + WAV-encode + /api/transcribe path
 * (useLiveConversation) with the browser's native Web Speech API. The user
 * sees their words land in the chat input area in real time as they speak,
 * just like ChatGPT / Claude voice mode — and when the browser detects
 * end-of-utterance silence, the final transcript is auto-submitted through
 * the existing send() path.
 *
 * Why this exists
 * -------------------------------------------------------------------------
 * The previous live-mode stack accumulated 12+ KIs of failure modes
 * (KI-044/057/060/064/113/114/115/131/134/139/141/159/165) trying to bolt
 * a reliable VAD onto raw mic PCM. Every fix surfaced a new failure on a
 * different mic / room / browser combo. The native SpeechRecognition API
 * gives us:
 *   - browser-grade end-of-speech detection (no rmsThreshold tuning)
 *   - streaming interim transcripts (no "where did my words go?" gap)
 *   - in-browser STT (no /api/transcribe round-trip latency)
 *
 * Note: the KI-168 PHASE 2 layer further down this file runs a MediaRecorder
 * in parallel and sends the recorded audio to /api/transcribe (Sarvam) at
 * end-of-utterance for the authoritative transcript, so the "no round-trip"
 * point above applies to the live interim text; Web Speech remains the
 * fallback when that call fails.
 *
 * Behaviour
 * -------------------------------------------------------------------------
 *   - `enabled = true`  → recognition.start() runs, mic icon stays live,
 *     interim transcript streams into the chat input via onInterimTranscript.
 *   - Browser detects ~1.5s silence → onend fires → we hand the final
 *     transcript to onFinalTranscript (caller calls send()).
 *   - After onend, if `enabled` is still true and no text request is in
 *     flight, we restart recognition so the mic stays live (continuous-mode
 *     emulation; native `continuous=true` doesn't fire silence-end on most
 *     browsers, so we use continuous=false + auto-restart instead).
 *   - `enabled = false` → recognition.abort() runs, no callbacks fire.
 *
 * Bot TTS playback is untouched — the page.tsx-owned <audio> elements still
 * play Sarvam-generated audio for assistant replies.
 */
| import { useCallback, useEffect, useRef, useState } from "react"; | |
| import { postTranscribe } from "./api"; | |
// KI-223..228 (2026-05-15) — additive resilience layer (V1.1/V1.3/V5.4/V6.8).
// Lives in a sibling module so the hook body stays under control and the
// retry / noise-floor / sample-rate helpers can be unit-tested in isolation.
| import { | |
| retryPostTranscribe, | |
| scaleSpeechZcrBand, | |
| AdaptiveNoiseFloor, | |
| type VoiceError as VoiceErrorBase, | |
| } from "./voice_resilience"; | |
// W1 (2026-05-15) — additive 4th voice-error code. Surfaces a silent
// `getUserMedia` permission/denial failure (NotAllowedError /
// NotFoundError / SecurityError / generic DOMException) so page.tsx can
// render an actionable banner and revert the "Voice on" pill. Kept as a
// local widening of the base `VoiceError` union from voice_resilience.ts
// (which we don't touch per scope) — callers see the same
// `onVoiceError(err: VoiceError) => void` shape, just with one more legal
// string value.
export type VoiceError = VoiceErrorBase | "mic_permission_denied";
// KI-189 (2026-05-15) — live-speak barge-in tuning constants.
// The MediaRecorder mic stream IS echo-cancelled by the browser (KI-185
// `getUserMedia` AEC constraints), so the bot's TTS bleed lands at a
// very low RMS (~0.001-0.005) while actual user speech sits at ~0.05-0.2.
// The threshold sits between those two bands, and the energy must be
// sustained for several consecutive frames so coughs / room thumps /
// single-frame spikes do not fire it.
// KI-212 (2026-05-15) — originally 0.025 / 18 frames (~300ms @ 60fps). User
// reported barge-in completely failing: bot reads entire 14s reply
// uninterrupted. Lowered to the values below so ANY decent speech burst
// fires within ~100ms. Risk: false positives (chair creak, cough) — an
// accepted trade vs. broken barge-in.
const BARGE_IN_RMS_THRESHOLD = 0.008;
const BARGE_IN_SUSTAINED_FRAMES = 6; // ~100ms @ 60fps rAF
// KI-190 (2026-05-15) — adaptive threshold. The MediaRecorder mic stream
// has AEC, but for very loud bot TTS the residual bleed can still cross
// a static threshold (0.025 when this note was written). The threshold is
// therefore computed dynamically from the bot's CURRENT audio level:
// bot_rms * MULTIPLIER + BASE. Bot loud → threshold rises, so the user
// must speak loudly to overcome the residual; bot quiet → threshold drops
// near the floor, so soft speech still wins.
// KI-212 — multiplier lowered 2.0 → 1.5 and base 0.005 → 0.002. Together
// with the static threshold drop, this makes barge-in fire on much softer
// user speech even when the bot is loud.
const BARGE_IN_BOT_RMS_MULTIPLIER = 1.5;
const BARGE_IN_BASE_THRESHOLD = 0.002;
// KI-191 (2026-05-15) — duck bot TTS volume while voice mode is on.
// Reducing playback amplitude further widens the gap between the bot's
// residual mic bleed (after AEC) and the user's normal-volume speech,
// which makes barge-in easier to trigger. The first value chosen was 0.6
// (clearly audible on headphones and laptop speakers without overpowering
// user speech).
// KI-211 (2026-05-15) — lowered 0.6 → 0.3 because first-turn barge-in
// fails when adaptive calibration (KI-195) hasn't sampled user_speech_rms
// yet. 0.3 is still loud enough to hear clearly on speakers, and the mic
// bleed is well under the static BARGE_IN_RMS_THRESHOLD, so users can talk
// over the bot on the first turn without needing prior calibration.
const VOICE_MODE_TTS_VOLUME = 0.3;
// KI-195 (2026-05-15) — adaptive TTS volume calibration relative to the
// user's own measured speech level. Architecture: while the user speaks
// (recorder active, NOT TTS) we sample mic RMS and track a rolling peak in
// userSpeechRmsRef. While TTS plays, every 300ms we sample bot_rms_at_mic
// via the KI-190 botAnalysers and reduce el.volume by 20% if bot_rms is
// closer to user_rms than the target ratio. Floor at 0.15 so the bot
// stays audible. The intent is that, after one calibration turn, bot bleed
// stays below user speech — so barge-in keeps working and echo stays under
// the recognition threshold.
const USER_SPEECH_RMS_INITIAL = 0.05; // typical quiet speech, used until calibrated
const USER_SPEECH_DETECTION_THRESHOLD = 0.02; // mic RMS above this counts as "user speaking"
// FIX 5 (HIGH) — hard ceiling on the rolling-peak userSpeechRms. Without
// this, a single shout pins userSpeechRms at 0.4+ for the entire session
// → adaptive barge-in threshold rises → normal-volume speech can't break
// through → user has to shout to barge in again. The userRmsTick is also
// gated on !isTtsPlaying, so during TTS playback there's NO decay path —
// the wall-clock decay interval below provides decay regardless of gating.
const USER_SPEECH_RMS_CEILING = 0.15;
const USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS = 1000;
const USER_SPEECH_RMS_WALL_CLOCK_DECAY_FACTOR = 0.9;
const VOLUME_CALIB_TARGET_RATIO = 0.35; // bot_rms_at_mic should be ≤ user_rms × this
const VOLUME_CALIB_TICK_MS = 300; // calibration sample period during TTS
const VOLUME_CALIB_DUCK_FACTOR = 0.8; // multiply el.volume by this per tick if too loud
const VOLUME_CALIB_FLOOR = 0.15; // never drop bot below this — must stay audible
// KI-202 (2026-05-15) — utterance batching grace window.
// Web Speech API's `onend` fires after ~1.5s silence, which means a natural
// mid-sentence pause ("So it will be just [pause] me") triggers TWO separate
// onend events and the user's sentence is submitted in two halves. We delay
// the actual submission by UTTERANCE_GRACE_MS after onend; if recognition
// re-fires (next word burst) before the timer expires, we append the new
// text/audio chunks and reset the timer. Only after a full UTTERANCE_GRACE_MS
// of true silence do we submit.
const UTTERANCE_GRACE_MS = 1500;
// KI-203 (2026-05-15) — post-TTS result-drop window.
// `recognition.abort()` doesn't immediately stop result delivery — onresult
// events from the now-abandoned recognition can keep arriving for a beat
// afterwards. Keep dropping results for this many ms after TTS ends.
const POST_TTS_DROP_MS = 300;
// KI-285 (2026-05-16) — echo-suppression barge-in grace window.
//
// ROOT CAUSE this fixes: the bot's TTS reply was stopping a fraction of a
// second after it started, with NO user having spoken. The reply audio is a
// single <audio> blob (no chunking, `ended` fires once at true end), so the
// premature stop could only come from triggerBargeIn() pausing the element.
// The barge-in VAD floors its threshold at BARGE_IN_RMS_THRESHOLD (0.008)
// when computeBotRms() returns 0 — which it ALWAYS does for the first frames
// of playback (the per-element MediaElementSource analyser has no data yet)
// and PERMANENTLY whenever createMediaElementSource() throws (Safari,
// element already Web-Audio-routed, or autoplay-suspended ctx). Browser AEC
// is imperfect on speaker (non-headphone) users; the bot's own voice echoes
// back into the mic at ~0.001-0.02 RMS in the speech ZCR band — clearing the
// 0.008 floor for 6 frames (~100ms) and self-triggering a "barge-in" on the
// bot's OWN audio. No prior hysteresis guarded the playback-start window.
//
// FIX: do not treat ANY VAD energy as a barge-in until the bot's audio has
// been playing for BARGE_IN_GRACE_MS. The first ~600ms of a reply is where
// echo (not the user) is the energy source — the user has not yet had time
// to hear enough of the reply to decide to interrupt, let alone produce
// BARGE_IN_SUSTAINED_FRAMES of speech. Genuine barge-in is unaffected: a
// real interruption is the user speaking *over* the bot for seconds, so the
// sustained-energy gate is re-armed and fires the instant the grace window
// elapses while the user is still talking. Only the bot's own start-of-reply
// echo — which by definition cannot outlast a brief grace window without the
// user actually speaking — is suppressed.
const BARGE_IN_GRACE_MS = 600;
// KI-285 (2026-05-16) — defence-in-depth. Even AFTER the grace window, when
// computeBotRms() is unavailable (returns 0) we must not collapse the
// barge-in threshold to the bare 0.008 static floor — that floor is BELOW
// the speaker echo bleed observed in the KI-285 analysis (up to ~0.02 RMS),
// so echo alone clears it. When we have no usable bot-level reference, hold
// the threshold at this echo-safe floor. Real user speech sits at
// ~0.05-0.2 RMS (KI-189) and clears this comfortably; residual AEC echo
// (~0.02 worst case on speakers) does not.
// NOTE(review): the KI-189 comment quotes ~0.001-0.005 for the
// echo-cancelled stream, not ~0.02 — confirm which figure is authoritative.
const BARGE_IN_NO_BOTREF_FLOOR = 0.035;
// =========================================================================
// #53 / #54 (2026-05-18) — push-to-talk head-clipping + start-latency fix.
//
// ROOT CAUSE (verified):
//   page.tsx's push-to-talk path cold-starts the mic on every SPACE press:
//     page.tsx:1350-1361  onKeyDown(SPACE) → startRecordingRef.current()
//     page.tsx:1004-1019  startRecording() → navigator.mediaDevices
//                           .getUserMedia(...)   [COLD — 200-700ms on HF Space]
//     page.tsx:1021       new MediaRecorder(stream)
//     page.tsx:1213       recorder.start()       [capture truly begins HERE]
//   Every word the user speaks between the keydown and recorder.start()
//   firing is *never captured* — the leading word is lost/garbled (#53,
//   transcribed "S A R" for "Sir."). The same cold-start is the multi-second
//   delay the user feels before recording begins (#54). There is NO pre-roll
//   buffer and NO warm/pre-armed stream anywhere in the codebase.
//
// FIX (this hook, since page.tsx is owned by another writer and its PTT path
// is fully self-contained):
//   - Keep ONE mic stream + MediaRecorder + AudioContext WARM for the hook's
//     entire armed lifetime (acquired once after the user opts into voice,
//     never torn down per-press, survives the Live↔PTT toggle). A persistent
//     open audio device means the OS mic is already hot, so page.tsx's own
//     per-press getUserMedia resolves in ~10-50ms instead of cold-starting
//     (200-700ms) — that alone removes the felt multi-second start delay.
//   - The warm MediaRecorder runs with a short timeslice, feeding a rolling
//     PRE-ROLL ring buffer that always holds the last ~PRE_ROLL_MS of audio.
//   - The PTT API (beginPushToTalk/endPushToTalk) prepends the pre-roll to
//     the captured utterance, so the FIRST WORD — spoken in the cold-start
//     gap — is always in the blob even though page.tsx's recorder missed it.
//   - A DELIBERATE-HOLD gate: beginPushToTalk arms instantly but the capture
//     only "engages" after HOLD_THRESHOLD_MS; a sub-threshold tap (key
//     bounce, accidental press) is discarded and produces no submission.
//   - AudioContext.resume() is kept warm WHILE armed (not lazily on first
//     press), and warm-stream / permission / worklet failures are surfaced
//     via onVoiceError — never silent.
//
// The pure pre-roll ring-buffer + hold-gate logic is exported (PreRollRing,
// evaluateHoldGate) so it is self-contained and independently exercised by
// the regression test.
// =========================================================================
// Size of the rolling pre-roll buffer. Intended to cover the page.tsx
// cold-start gap (getUserMedia 200-700ms + MediaRecorder spin-up + the
// optional 400ms Live-teardown wait at page.tsx:994) without bloating the
// blob (browser webm/opus ≈ 4 KB/s → ~3.2 KB of lead-in at 800ms).
// NOTE(review): the components listed above can add up to more than 800ms
// (700ms + 400ms alone is 1100ms) — confirm that 800ms really covers the
// worst case, or that the teardown wait never overlaps a cold getUserMedia.
export const PRE_ROLL_MS = 800;
// Warm MediaRecorder timeslice. Small enough that the pre-roll ring has fine
// granularity (we never drop more than one slice of lead-in when trimming the
// ring to PRE_ROLL_MS), large enough not to thrash ondataavailable.
export const WARM_TIMESLICE_MS = 200;
// Deliberate-hold threshold (#54). The hold must be intentional so an
// accidental tap / key-bounce doesn't fire a turn, but it must feel instant
// on a real hold — 200ms sits in the requested 150-250ms band.
export const HOLD_THRESHOLD_MS = 200;
/**
 * PreRollRing — a rolling, time-bounded buffer of MediaRecorder Blob slices.
 *
 * `push` appends a freshly emitted slice together with the duration it is
 * assumed to represent (default ~WARM_TIMESLICE_MS). Once the retained
 * duration could lose its oldest slice and still be >= `windowMs`, that slice
 * is evicted. After the ring has filled it therefore always holds at least
 * `windowMs` of audio and strictly less than `windowMs` plus one slice, so a
 * head word that started just before the window boundary is never trimmed.
 * At least one slice is always kept.
 *
 * Durations are bookkeeping supplied by the caller — the ring never inspects
 * the audio. A NaN, non-finite or non-positive `sliceMs` would poison that
 * bookkeeping (eviction would silently stop and the ring would grow for the
 * lifetime of the warm stream), so such values are counted as one default
 * timeslice instead. A NaN `windowMs` falls back to PRE_ROLL_MS for the same
 * reason.
 *
 * Pure + framework-free so the regression test can drive it directly without
 * a browser.
 *
 * NOTE(review): MediaRecorder timeslice blobs are generally only guaranteed
 * to be playable when concatenated from the first blob of a recording (the
 * container header lives there). Evicting the oldest slice may drop that
 * header — confirm the consumer of `drain()` can decode a blob that starts
 * mid-recording.
 */
export class PreRollRing {
  private slices: Array<{ blob: Blob; ms: number }> = [];
  private retainedMs = 0;
  private readonly windowMs: number;

  /**
   * @param windowMs Minimum lead-in duration to retain, in milliseconds.
   *   NaN falls back to PRE_ROLL_MS.
   */
  constructor(windowMs: number = PRE_ROLL_MS) {
    this.windowMs = Number.isNaN(windowMs) ? PRE_ROLL_MS : windowMs;
  }

  /**
   * Append one recorder slice and evict slices that are no longer needed.
   *
   * @param blob Slice to retain; a missing or zero-byte blob is ignored.
   * @param sliceMs Duration the slice represents, in milliseconds. Values
   *   that are not finite and positive are counted as WARM_TIMESLICE_MS.
   */
  push(blob: Blob, sliceMs: number = WARM_TIMESLICE_MS): void {
    if (!blob || blob.size <= 0) return;
    const ms =
      Number.isFinite(sliceMs) && sliceMs > 0 ? sliceMs : WARM_TIMESLICE_MS;
    this.slices.push({ blob, ms });
    this.retainedMs += ms;
    // Evict from the front only while doing so still leaves >= windowMs
    // retained, and never evict the last remaining slice.
    while (this.slices.length > 1) {
      const oldest = this.slices[0];
      if (oldest === undefined) break;
      if (this.retainedMs - oldest.ms < this.windowMs) break;
      this.slices.shift();
      this.retainedMs -= oldest.ms;
    }
  }

  /** Returns the retained lead-in slices oldest-first and empties the ring. */
  drain(): Blob[] {
    const out = this.slices.map((s) => s.blob);
    this.clear();
    return out;
  }

  /** Approximate retained duration (ms), summed from the pushed `sliceMs`. */
  retainedDurationMs(): number {
    return this.retainedMs;
  }

  /** Discards every retained slice without returning them. */
  clear(): void {
    this.slices = [];
    this.retainedMs = 0;
  }
}
/**
 * evaluateHoldGate — pure decision for the deliberate-hold threshold (#54).
 *
 * Given when the user pressed and released, decide whether the press was a
 * DELIBERATE hold (capture should be submitted) or a sub-threshold TAP
 * (discard — accidental press / key bounce). Kept pure so the regression
 * test can assert the boundary exactly without timers.
 *
 *   heldMs >= thresholdMs → { deliberate: true }   (engage + submit)
 *   heldMs <  thresholdMs → { deliberate: false }  (discard, no submit)
 *
 * A release that precedes the press is clamped to a 0 ms hold. If either
 * timestamp is NaN the press is reported as a 0 ms non-deliberate tap, so
 * callers never receive `heldMs: NaN`.
 *
 * NOTE(review): the hook documents 0 as its "not pressed" value for the press
 * timestamp. Passing that sentinel here with a real clock reading yields a
 * very large heldMs and `deliberate: true` — callers presumably check for
 * "not pressed" before calling; confirm.
 *
 * @param pressedAt Timestamp of the press, in milliseconds.
 * @param releasedAt Timestamp of the release, in the same clock as pressedAt.
 * @param thresholdMs Minimum hold duration that counts as deliberate.
 * @returns The classification and the clamped hold duration in milliseconds.
 */
export function evaluateHoldGate(
  pressedAt: number,
  releasedAt: number,
  thresholdMs: number = HOLD_THRESHOLD_MS,
): { deliberate: boolean; heldMs: number } {
  const elapsed = releasedAt - pressedAt;
  if (Number.isNaN(elapsed)) {
    // Unusable timestamps: never treat as a deliberate hold.
    return { deliberate: false, heldMs: 0 };
  }
  const heldMs = Math.max(0, elapsed);
  return { deliberate: heldMs >= thresholdMs, heldMs };
}
// Minimal local typings for the slice of the Web Speech API this hook uses.
// Per the original author's note they are declared here because lib.dom.d.ts
// ships the API under `webkitSpeechRecognition` only and the standard
// `SpeechRecognition` symbol is still vendor-prefixed in most browsers as of
// 2026-05. Only the members the hook touches are modelled.

/** One recognition hypothesis: the recognised text and its confidence. */
type SpeechRecognitionAlternative = { transcript: string; confidence: number };

/**
 * Array-like list of alternatives for one stretch of speech. `isFinal`
 * distinguishes a settled result from an interim one.
 */
type SpeechRecognitionResult = {
  isFinal: boolean;
  length: number;
  [index: number]: SpeechRecognitionAlternative;
};

/** Array-like list of results carried by a recognition event. */
type SpeechRecognitionResultList = {
  length: number;
  [index: number]: SpeechRecognitionResult;
};

/** Shape of the event passed to `onresult`. */
interface SpeechRecognitionEventLike extends Event {
  resultIndex: number;
  results: SpeechRecognitionResultList;
}

/** Shape of the event passed to `onerror`; `error` is a string error code. */
interface SpeechRecognitionErrorEventLike extends Event {
  error: string;
  message?: string;
}

/** The recognition object: configuration, lifecycle methods and handlers. */
interface SpeechRecognitionInstance extends EventTarget {
  lang: string;
  continuous: boolean;
  interimResults: boolean;
  maxAlternatives: number;
  start: () => void;
  stop: () => void;
  abort: () => void;
  onresult: ((ev: SpeechRecognitionEventLike) => void) | null;
  onerror: ((ev: SpeechRecognitionErrorEventLike) => void) | null;
  onend: ((ev: Event) => void) | null;
  onstart: ((ev: Event) => void) | null;
}

/** Zero-argument constructor for a recognition object. */
type SpeechRecognitionCtor = new () => SpeechRecognitionInstance;
/** Inputs accepted by the `useStreamingVoice` hook. */
export interface UseStreamingVoiceOptions {
  /** `true` keeps recognition running; `false` aborts it. */
  enabled: boolean;
  /** Receives the interim transcript so the caller can stream it into the chat input. */
  onInterimTranscript: (text: string) => void;
  /** Receives the final transcript of an utterance (the caller submits it via send()). */
  onFinalTranscript: (text: string) => void;
  /** Receives a human-readable error message. NOTE(review): presumably for display — confirm against page.tsx. */
  onError: (msg: string) => void;
  /** Notified when the listening state changes. NOTE(review): presumably `true` while recognition is active — confirm. */
  onListening: (listening: boolean) => void;
  /** Caller-owned flag that is `true` while a text request is in flight; the hook reads it before restarting or submitting. */
  isTextRequestPendingRef: React.MutableRefObject<boolean>;
  /** Recognition language tag; the hook defaults it to "en-IN" when omitted. */
  language?: string;
  // KI-223 (2026-05-15) — V1.1 / V1.2 / V5.4. Optional structured error
  // callback so page.tsx can react specifically to recoverable failures
  // (e.g. show "tap to enable audio" when audio_context_suspended fires).
  // Optional: existing consumers that don't pass this still work.
  onVoiceError?: (err: VoiceError) => void;
}
/** Handle returned by the `useStreamingVoice` hook. */
export interface UseStreamingVoiceReturn {
  start: () => void;
  stop: () => void;
  /** `true` when a SpeechRecognition constructor was found in this browser. */
  isSupported: boolean;
  /**
   * FIX 3 (HIGH) — Barge-in signal. The hook flips an internal flag when
   * `triggerBargeIn` fires (user spoke over bot TTS). The caller (page.tsx)
   * should poll this method before/after every fetch tick during a /api/chat
   * stream — if it returns true, abort the in-flight request and any pending
   * audio assembly so the bot doesn't keep talking after the user
   * interrupted. Reading clears the flag (one-shot semantics).
   *
   * Wire-up (caller side, OUT OF THIS HOOK'S SCOPE):
   *   - Before fetch, store an AbortController locally.
   *   - In the stream-reading loop, periodically check
   *     `streamingVoice.consumeBargeInSignal()` and call `controller.abort()`
   *     when it returns true.
   *   - Alternatively register a side-effect that polls every 100ms while a
   *     send() is in flight.
   */
  consumeBargeInSignal: () => boolean;
  // ----------------------------------------------------------------------
  // #53 / #54 — warm-stream + pre-roll push-to-talk API.
  //
  // This is the minimal API the push-to-talk UI integrates with. Even
  // without an explicit call, `armWarmStream()` is invoked autonomously by
  // the hook once voice has been enabled, so the OS mic device is kept hot
  // for the rest of the session — that removes the per-press cold-start that
  // page.tsx's own getUserMedia otherwise pays (the felt multi-second delay,
  // #54) and continuously fills the pre-roll ring so the leading word spoken
  // in the cold-start gap survives (#53).
  // ----------------------------------------------------------------------
  /** True once the warm mic stream + recorder + AudioContext are live and
   *  the pre-roll ring is filling. */
  isWarm: boolean;
  /** Pre-arm (or re-arm) the persistent warm stream. Idempotent; safe to
   *  call repeatedly. Resolves true when the warm stream is recording. */
  armWarmStream: () => Promise<boolean>;
  /** Release the warm stream + recorder + AudioContext (mic indicator off).
   *  Called on unmount; callers may call it to fully relinquish the mic. */
  disarmWarmStream: () => void;
  /**
   * Engage a push-to-talk capture. Call on hold-start (e.g. SPACE keydown).
   * Returns immediately. The capture *engages* only after HOLD_THRESHOLD_MS
   * so a sub-threshold tap is ignored; the engaged utterance is seeded with
   * the pre-roll ring so the first word (spoken during the cold-start gap)
   * is always included.
   */
  beginPushToTalk: () => void;
  /**
   * End a push-to-talk capture. Call on hold-release (e.g. SPACE keyup).
   * If the hold was deliberate (>= HOLD_THRESHOLD_MS) the assembled blob
   * (pre-roll + live capture) is transcribed and delivered via
   * onFinalTranscript; a sub-threshold tap resolves to null and submits
   * nothing. Resolves with the final transcript, or null when discarded /
   * empty.
   */
  endPushToTalk: () => Promise<string | null>;
  /** Snapshot+drain the current pre-roll ring (oldest-first). Exposed for
   *  the regression test and any caller that wants to splice the lead-in
   *  into its own recorder blob. */
  consumePreRollChunks: () => Blob[];
}
/**
 * Looks up the browser's speech-recognition constructor.
 *
 * Prefers the standard `window.SpeechRecognition` and falls back to the
 * vendor-prefixed `window.webkitSpeechRecognition`.
 *
 * @returns The constructor, or `null` when there is no `window` (non-browser
 *   execution) or neither global is defined.
 */
function resolveCtor(): SpeechRecognitionCtor | null {
  // Guard first: touching `window` where it does not exist would throw.
  if (typeof window === "undefined") return null;
  // Neither property is relied on from the DOM typings, so view `window`
  // through a narrow local shape that declares exactly the two lookups.
  const w = window as unknown as {
    SpeechRecognition?: SpeechRecognitionCtor;
    webkitSpeechRecognition?: SpeechRecognitionCtor;
  };
  return w.SpeechRecognition ?? w.webkitSpeechRecognition ?? null;
}
| export function useStreamingVoice( | |
| opts: UseStreamingVoiceOptions, | |
| ): UseStreamingVoiceReturn { | |
| const { | |
| enabled, | |
| onInterimTranscript, | |
| onFinalTranscript, | |
| onError, | |
| onListening, | |
| isTextRequestPendingRef, | |
| language = "en-IN", | |
| onVoiceError, | |
| } = opts; | |
| // Keep latest callback refs so the recognition handlers always call the | |
| // freshest closure without re-binding the recognition instance on every | |
| // render (re-binding mid-utterance loses interim results). | |
| const onInterimRef = useRef(onInterimTranscript); | |
| const onFinalRef = useRef(onFinalTranscript); | |
| const onErrorRef = useRef(onError); | |
| const onListeningRef = useRef(onListening); | |
| // KI-223 β optional structured-error callback ref. Defaults to no-op so | |
| // the rest of the hook can call it unconditionally without null checks. | |
| const onVoiceErrorRef = useRef<(err: VoiceError) => void>( | |
| onVoiceError ?? (() => { /* no-op */ }), | |
| ); | |
| useEffect(() => { onInterimRef.current = onInterimTranscript; }, [onInterimTranscript]); | |
| useEffect(() => { onFinalRef.current = onFinalTranscript; }, [onFinalTranscript]); | |
| useEffect(() => { onErrorRef.current = onError; }, [onError]); | |
| useEffect(() => { onListeningRef.current = onListening; }, [onListening]); | |
| useEffect(() => { onVoiceErrorRef.current = onVoiceError ?? (() => { /* no-op */ }); }, [onVoiceError]); | |
| const recognitionRef = useRef<SpeechRecognitionInstance | null>(null); | |
| const finalsRef = useRef<string[]>([]); | |
| // KI-217 (2026-05-15) β track how many entries of finalsRef have already | |
| // been drained to pendingUtteranceRef. Each onend reads the slice from | |
| // `finalsConsumedRef.current` to end, then bumps the cursor. finalsRef | |
| // itself is NOT reset between restart cycles β only after the grace-timer | |
| // submit (when onFinalRef fires) or on user-toggled start/stop. This | |
| // prevents a Chrome quirk where late-delivered isFinal results arriving | |
| // after onend on a mid-utterance restart cycle would land in a freshly | |
| // wiped finalsRef and get dropped on the NEXT onend cycle's drain. | |
| const finalsConsumedRef = useRef<number>(0); | |
| const wantRunningRef = useRef(false); // mirrors `enabled` for handler closures | |
| const restartTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null); | |
| const errorBackoffRef = useRef(0); | |
| // KI-188 (2026-05-15) β TTS-playback gate. Web Speech API has its own | |
| // internal mic pipeline that bypasses our getUserMedia AEC constraints, | |
| // so SpeechRecognition transcribes the bot's TTS audio bleeding from | |
| // speakers as user input ("echo loop"). The only reliable fix from JS | |
| // is to abort recognition while ANY <audio> in the DOM is playing. | |
| // Tracked via a MutationObserver + per-element play/pause/ended hooks. | |
| const isTtsPlayingRef = useRef(false); | |
| // KI-285 (2026-05-16) β wall-clock timestamp of the moment the CURRENT | |
| // bot TTS playback began (the falseβtrue edge in updateTtsState). The | |
| // barge-in tick refuses to trigger until BARGE_IN_GRACE_MS has elapsed | |
| // since this instant, so the bot's own start-of-reply echo cannot | |
| // self-trigger a barge-in. Reset to 0 whenever TTS is not playing. | |
| const ttsPlaybackStartedAtRef = useRef<number>(0); | |
| const ttsAudioElementsRef = useRef<Set<HTMLAudioElement>>(new Set()); | |
| // KI-203 (2026-05-15) β silently discard SpeechRecognition.onresult events | |
| // while this flag is true. Flipped on the instant TTS playback starts | |
| // (closes the ~100-300ms window between `audio.play()` and our abort() | |
| // taking effect, during which bot voice was being transcribed as user | |
| // input). Flipped back ~POST_TTS_DROP_MS after TTS ends so any in-flight | |
| // results from the dying recognition pipeline are still suppressed. | |
| const dropResultsRef = useRef(false); | |
| const dropResultsClearTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null); | |
| // KI-202 (2026-05-15) β utterance-batching state. | |
| // pendingUtteranceRef accumulates the Web Speech transcript across multiple | |
| // onend events separated by sub-grace-window pauses. pendingChunksRef does | |
| // the same for MediaRecorder blobs so the Sarvam POST sees the WHOLE | |
| // utterance, not just the tail after the last pause. pendingSubmitTimerRef | |
| // is the grace-window setTimeout; it gets reset every time onend appends | |
| // more content. | |
| const pendingUtteranceRef = useRef<string>(""); | |
| const pendingChunksRef = useRef<Blob[]>([]); | |
| const pendingSubmitTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null); | |
| // FIX 3 (HIGH) β one-shot barge-in signal. Flipped true by triggerBargeIn | |
| // when the VAD detects sustained user speech over bot TTS. Read+cleared | |
| // via consumeBargeInSignal() so the caller (page.tsx) can abort any | |
| // in-flight /api/chat request that's still assembling more TTS audio. | |
| const bargeInRequestedRef = useRef<boolean>(false); | |
| // KI-228 (2026-05-15) β V6.8 adaptive noise floor. Persistent across the | |
| // entire hook lifetime so a user's noise environment learned across the | |
| // first 5 seconds carries through later TTS plays even if the audio | |
| // effect tears down + rebuilds the analyser between turns. | |
| const noiseFloorRef = useRef<AdaptiveNoiseFloor>(new AdaptiveNoiseFloor()); | |
| // KI-225 (2026-05-15) β V1.3 sample-rate-aware ZCR band, cached from the | |
| // AudioContext at analyser-build time. Falls back to the 48 kHz reference | |
| // band when the context isn't up yet. | |
| const zcrBandRef = useRef<{ min: number; max: number }>({ min: 20, max: 250 }); | |
| // ---------------------------------------------------------------------- | |
| // KI-168 PHASE 2 β Sarvam authoritative-transcript layer. | |
| // We run a MediaRecorder in parallel with SpeechRecognition. When the | |
| // browser detects end-of-utterance silence (recognition.onend), we | |
| // already have the raw audio chunks in memory. Send them to the backend | |
| // /api/transcribe endpoint (Sarvam STT) and replace the Web Speech text | |
| // with Sarvam's authoritative result. Web Speech remains the fallback if | |
| // Sarvam times out, errors, or the audio path failed to initialise. | |
| // ---------------------------------------------------------------------- | |
| const mediaStreamRef = useRef<MediaStream | null>(null); | |
| const mediaRecorderRef = useRef<MediaRecorder | null>(null); | |
| const chunksRef = useRef<Blob[]>([]); | |
| const recorderMimeRef = useRef<string>("audio/webm"); | |
| // True only when MediaRecorder.start() actually succeeded. If false we | |
| // bypass the Sarvam path and use Web Speech transcripts directly. | |
| const recorderActiveRef = useRef(false); | |
| // Promise resolved on the recorder's next `stop` event so we can wait | |
| // for the final ondataavailable chunk before building the blob. | |
| const recorderStopWaiterRef = useRef<(() => void) | null>(null); | |
// ----------------------------------------------------------------------
// #53 / #54 — warm-stream + pre-roll push-to-talk (PTT) state.
//
// Deliberately separate from the Live-mode stream/recorder refs: this
// capture is opened by armWarmStream and released only by
// disarmWarmStream, i.e. it is not torn down on every press.
// ----------------------------------------------------------------------
const warmStreamRef = useRef<MediaStream | null>(null);
const warmRecorderRef = useRef<MediaRecorder | null>(null);
// AudioContext that armWarmStream creates and resumes alongside the stream.
const warmCtxRef = useRef<AudioContext | null>(null);
// MIME type of the warm recorder; also used as the type of the PTT blob.
const warmMimeRef = useRef("audio/webm");
// Rolling pre-roll ring sized by PRE_ROLL_MS. Every warm-recorder slice is
// pushed into it; engagePtt drains it to obtain the lead-in audio.
const preRollRef = useRef<PreRollRing>(new PreRollRing(PRE_ROLL_MS));
// Slices collected for the current PTT hold (lead-in first, then the live
// slices that arrive while engaged).
const pttCaptureRef = useRef<Blob[]>([]);
// True from engagePtt until endPushToTalk; while set, the warm recorder's
// ondataavailable also appends each slice to pttCaptureRef.
const pttEngagedRef = useRef(false);
// Date.now() of the current press, or 0 when nothing is pressed. Fed to
// evaluateHoldGate on release.
const pttPressedAtRef = useRef(0);
// Timeout that fires HOLD_THRESHOLD_MS after a press and engages the
// capture; cleared on release so a short tap never engages.
const pttHoldTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
// Latched to true the first time armWarmStream runs.
const voiceEverEnabledRef = useRef(false);
// Mirrors whether the warm capture is currently armed.
const [isWarm, setIsWarm] = useState(false);
// Whether resolveCtor() found a recognition constructor; evaluated once via
// the lazy useState initialiser.
const [isSupported] = useState(() => resolveCtor() !== null);
// Cancels the pending recognition-restart timeout, if one is scheduled, and
// forgets its id.
const clearRestartTimer = useCallback(() => {
  const pending = restartTimerRef.current;
  if (pending === null) return;
  clearTimeout(pending);
  restartTimerRef.current = null;
}, []);
// KI-210 — hold a voice utterance until an in-flight text turn has cleared
// instead of dropping it.
//
// Polls isTextRequestPendingRef every 300ms.
//   maxWaitMs — upper bound on the total wait (default 30000).
//   resolves true  — the pending flag cleared within the bound.
//   resolves false — the bound elapsed first; callers proceed anyway so a
//                    stuck text request cannot hold the utterance forever.
const waitForTextClear = useCallback(async (maxWaitMs = 30000): Promise<boolean> => {
  const startedAt = Date.now();
  for (;;) {
    if (!isTextRequestPendingRef.current) {
      return true;
    }
    const waitedMs = Date.now() - startedAt;
    if (waitedMs > maxWaitMs) {
      console.debug("[useStreamingVoice] KI-210 wait timed out, submitting anyway");
      return false;
    }
    await new Promise((wake) => setTimeout(wake, 300));
  }
}, [isTextRequestPendingRef]);
// Starts the current recognition instance, if there is one. Any exception
// from start() is swallowed on purpose: the original notes that start()
// throws InvalidStateError when recognition is already running, and the
// onstart/onend handlers keep the listening state in sync regardless.
const safeStart = useCallback(() => {
  try {
    recognitionRef.current?.start();
  } catch {
    /* already running — nothing to do */
  }
}, []);
// Chooses the MediaRecorder mimeType to record with.
//
// Candidates are probed in preference order (webm/opus, webm, mp4, mpeg)
// and the first one MediaRecorder.isTypeSupported accepts wins. Returns ""
// when MediaRecorder is unavailable or no candidate is supported; callers
// then construct the recorder without an explicit mimeType.
const pickRecorderMime = useCallback((): string => {
  const recorderAvailable =
    typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
  if (!recorderAvailable) return "";
  const preferenceOrder = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/mpeg"];
  const supported = preferenceOrder.find((candidate) => {
    try {
      return MediaRecorder.isTypeSupported(candidate);
    } catch {
      // A throwing probe counts as "not supported".
      return false;
    }
  });
  return supported ?? "";
}, []);
// Stops the Live recorder and resolves once its `stop` event has fired.
//
// Resolves immediately when there is no recorder or it is already inactive.
// Otherwise a resolver is parked in recorderStopWaiterRef (the recorder's
// onstop handler calls it) and stop() is requested. If stop() itself throws,
// the parked resolver is removed and the promise resolves right away.
const stopRecorder = useCallback((): Promise<void> => {
  const active = mediaRecorderRef.current;
  if (active === null || active.state === "inactive") {
    return Promise.resolve();
  }
  return new Promise<void>((done) => {
    recorderStopWaiterRef.current = () => done();
    try {
      active.stop();
    } catch {
      // Recorder was already stopped.
      recorderStopWaiterRef.current = null;
      done();
    }
  });
}, []);
// Releases the Live-mode capture: stops the recorder, detaches its handlers,
// stops every track of the stream and resets the associated refs.
//
// Safe to call repeatedly; with nothing acquired it only resets the refs.
const teardownAudio = useCallback(() => {
  const recorder = mediaRecorderRef.current;
  if (recorder) {
    try {
      if (recorder.state !== "inactive") recorder.stop();
    } catch {
      // ignore — recorder was already stopped
    }
    recorder.ondataavailable = null;
    recorder.onstop = null;
    recorder.onerror = null;
  }
  mediaRecorderRef.current = null;
  const stream = mediaStreamRef.current;
  if (stream) {
    stream.getTracks().forEach((t) => {
      try { t.stop(); } catch { /* ignore */ }
    });
  }
  mediaStreamRef.current = null;
  chunksRef.current = [];
  recorderActiveRef.current = false;
  // onstop was detached above, so the recorder's `stop` event can no longer
  // release a caller that is awaiting stopRecorder(). Release it here;
  // otherwise that promise would stay pending forever.
  const waiter = recorderStopWaiterRef.current;
  recorderStopWaiterRef.current = null;
  if (waiter) waiter();
}, []);
// Makes sure the Live-mode MediaRecorder is capturing.
//
// Returns true when a recorder is (already or newly) recording. Returns
// false when the media APIs are missing or acquisition failed. On failure
// it also emits onVoiceError("mic_permission_denied"), clears wantRunningRef
// and reports onListening(false), because callers invoke this as
// fire-and-forget and would otherwise never notice.
const ensureAudioCapture = useCallback(async (): Promise<boolean> => {
  if (mediaRecorderRef.current && recorderActiveRef.current) return true;
  if (typeof navigator === "undefined" || !navigator.mediaDevices) return false;
  if (typeof MediaRecorder === "undefined") return false;
  const stopTracks = (s: MediaStream): void => {
    s.getTracks().forEach((t) => {
      try { t.stop(); } catch { /* ignore */ }
    });
  };
  // Stream obtained by THIS call, so the catch block can release it no
  // matter which later step failed.
  let acquired: MediaStream | null = null;
  try {
    // KI-185 — explicit AEC + noise suppression + auto-gain; a bare
    // `{ audio: true }` does not force echo cancellation everywhere.
    const request = navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });
    // W2 — 2s watchdog: some environments stall getUserMedia instead of
    // rejecting. The watchdog rejects with name="StallTimeout" so the catch
    // below treats a stall like any other acquisition failure.
    let stalled = false;
    let stallTimer: ReturnType<typeof setTimeout> | undefined;
    // If the watchdog wins, the browser may still hand over a stream later.
    // Nobody owns that stream, so stop it rather than leave the mic open.
    void request.then(
      (late) => {
        if (stalled) stopTracks(late);
      },
      () => {
        /* the rejection is observed through the race below */
      },
    );
    const watchdog = new Promise<MediaStream>((_, reject) => {
      stallTimer = setTimeout(() => {
        stalled = true;
        const e = new Error("getUserMedia stalled >2s") as Error & { name: string };
        e.name = "StallTimeout";
        reject(e);
      }, 2000);
    });
    let stream: MediaStream;
    try {
      stream = await Promise.race([request, watchdog]);
    } finally {
      clearTimeout(stallTimer);
    }
    acquired = stream;
    const mime = pickRecorderMime();
    recorderMimeRef.current = mime || "audio/webm";
    const recorder = mime ? new MediaRecorder(stream, { mimeType: mime }) : new MediaRecorder(stream);
    chunksRef.current = [];
    recorder.ondataavailable = (ev: BlobEvent) => {
      if (ev.data && ev.data.size > 0) chunksRef.current.push(ev.data);
    };
    recorder.onstop = () => {
      // Release whoever is awaiting stopRecorder().
      const waiter = recorderStopWaiterRef.current;
      recorderStopWaiterRef.current = null;
      if (waiter) waiter();
    };
    recorder.onerror = (ev: Event) => {
      console.debug("[useStreamingVoice] MediaRecorder error", ev);
    };
    mediaStreamRef.current = stream;
    mediaRecorderRef.current = recorder;
    // 1s timeslice so ondataavailable fires progressively rather than only
    // on stop().
    recorder.start(1000);
    // W2 — start() returning without throwing is not proof of a live
    // capture; require the "recording" state. Cleanup happens in the catch.
    if (recorder.state !== "recording") {
      throw Object.assign(new Error(`MediaRecorder did not enter recording state (got ${recorder.state})`), {
        name: "RecorderNotRecording",
      });
    }
    recorderActiveRef.current = true;
    console.debug("[useStreamingVoice] MediaRecorder started", {
      mime: recorderMimeRef.current,
      state: recorder.state,
    });
    return true;
  } catch (err) {
    // W1 — every failure (denied, no device, device busy, stall, recorder
    // init) maps to "mic_permission_denied": the user-facing remediation is
    // the same.
    const name = (err as { name?: string } | null)?.name ?? "Error";
    console.debug(
      "[useStreamingVoice] getUserMedia / MediaRecorder init failed",
      { name, err },
    );
    recorderActiveRef.current = false;
    // The failure may have happened AFTER the stream was acquired (recorder
    // constructor or start() threw, or the state check failed). Release the
    // stream and drop refs that point at the half-built capture.
    if (acquired !== null) {
      stopTracks(acquired);
      if (mediaStreamRef.current === acquired) {
        const failed = mediaRecorderRef.current;
        if (failed) {
          try {
            if (failed.state !== "inactive") failed.stop();
          } catch {
            /* ignore */
          }
          failed.ondataavailable = null;
          failed.onstop = null;
          failed.onerror = null;
        }
        mediaStreamRef.current = null;
        mediaRecorderRef.current = null;
      }
    }
    try {
      onVoiceErrorRef.current("mic_permission_denied" as VoiceError);
    } catch {
      /* never let a user-supplied callback crash the hook */
    }
    // Halt the recognition restart loop and reset the listening state so the
    // UI does not show an active mic over a dead capture.
    wantRunningRef.current = false;
    try {
      onListeningRef.current(false);
    } catch {
      /* ignore */
    }
    return false;
  }
}, [pickRecorderMime]);
// ======================================================================
// #53 / #54 — warm-stream + pre-roll push-to-talk engine.
// ======================================================================

// Fully releases the warm capture and resets all PTT bookkeeping: cancels
// the hold timer, clears the press/engage flags and both audio buffers,
// stops the warm recorder and its tracks, closes the AudioContext and
// finally marks the hook as not warm.
const disarmWarmStream = useCallback(() => {
  const holdTimer = pttHoldTimerRef.current;
  if (holdTimer !== null) {
    clearTimeout(holdTimer);
    pttHoldTimerRef.current = null;
  }
  pttEngagedRef.current = false;
  pttPressedAtRef.current = 0;
  pttCaptureRef.current = [];
  preRollRef.current.clear();

  const warmRecorder = warmRecorderRef.current;
  if (warmRecorder) {
    try {
      // Detach handlers first so the stop below cannot re-enter the hook.
      warmRecorder.ondataavailable = null;
      warmRecorder.onerror = null;
      warmRecorder.onstop = null;
      if (warmRecorder.state !== "inactive") warmRecorder.stop();
    } catch {
      /* ignore */
    }
  }
  warmRecorderRef.current = null;

  const warmStream = warmStreamRef.current;
  if (warmStream) {
    for (const track of warmStream.getTracks()) {
      try { track.stop(); } catch { /* ignore */ }
    }
  }
  warmStreamRef.current = null;

  const audioCtx = warmCtxRef.current;
  if (audioCtx) {
    warmCtxRef.current = null;
    try { void audioCtx.close(); } catch { /* ignore */ }
  }
  setIsWarm(false);
}, []);
// Acquires (or re-acquires) the persistent warm capture that feeds the PTT
// pre-roll ring.
//
// Idempotent: a warm recorder that is already recording short-circuits to
// true. Returns false when the media APIs are missing or acquisition fails;
// a failure is also reported through onVoiceError("mic_permission_denied").
//
// An in-progress PTT press (pttPressedAtRef / pttHoldTimerRef /
// pttEngagedRef) is left intact. beginPushToTalk calls this on every press,
// and the acquisition can finish after the hold timer has already engaged
// the capture; resetting the press state here would discard that press.
const armWarmStream = useCallback(async (): Promise<boolean> => {
  voiceEverEnabledRef.current = true;
  const existing = warmRecorderRef.current;
  if (existing && existing.state === "recording" && warmStreamRef.current) {
    return true;
  }
  if (typeof navigator === "undefined" || !navigator.mediaDevices) return false;
  if (typeof MediaRecorder === "undefined") return false;
  const stopTracks = (s: MediaStream): void => {
    s.getTracks().forEach((t) => {
      try { t.stop(); } catch { /* ignore */ }
    });
  };
  const detachAndStop = (r: MediaRecorder): void => {
    try {
      r.ondataavailable = null;
      r.onerror = null;
      r.onstop = null;
      if (r.state !== "inactive") r.stop();
    } catch {
      /* ignore */
    }
  };
  // Release a stale or half-built prior capture before re-acquiring. Only
  // the audio resources and buffers are reset — not the press bookkeeping.
  const staleStream = warmStreamRef.current;
  if (existing || staleStream) {
    if (existing) detachAndStop(existing);
    if (staleStream) stopTracks(staleStream);
    warmRecorderRef.current = null;
    warmStreamRef.current = null;
    preRollRef.current.clear();
    pttCaptureRef.current = [];
    setIsWarm(false);
  }
  // Stream obtained by THIS call, so the catch block can release it.
  let acquired: MediaStream | null = null;
  try {
    // Same AEC/NS/AGC constraints as the Live path (KI-185).
    const request = navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });
    // W2-style 2s stall watchdog so a hung getUserMedia surfaces an error.
    let stalled = false;
    let stallTimer: ReturnType<typeof setTimeout> | undefined;
    // A stream delivered after the watchdog fired has no owner: stop it so
    // the mic is not left open.
    void request.then(
      (late) => {
        if (stalled) stopTracks(late);
      },
      () => {
        /* the rejection is observed through the race below */
      },
    );
    const watchdog = new Promise<MediaStream>((_, reject) => {
      stallTimer = setTimeout(() => {
        stalled = true;
        const e = new Error("warm getUserMedia stalled >2s") as Error & { name: string };
        e.name = "StallTimeout";
        reject(e);
      }, 2000);
    });
    let stream: MediaStream;
    try {
      stream = await Promise.race([request, watchdog]);
    } finally {
      clearTimeout(stallTimer);
    }
    acquired = stream;
    // Another armWarmStream call may have completed while this one was
    // awaiting. Keep the winner and drop our duplicate stream instead of
    // installing a second recorder over it.
    const winner = warmRecorderRef.current;
    if (winner && winner.state === "recording" && warmStreamRef.current) {
      stopTracks(stream);
      acquired = null;
      return true;
    }
    const mime = pickRecorderMime();
    warmMimeRef.current = mime || "audio/webm";
    const recorder = mime
      ? new MediaRecorder(stream, { mimeType: mime })
      : new MediaRecorder(stream);
    // Fresh buffers for the fresh recorder: slices from different recorders
    // must not be concatenated into one blob.
    preRollRef.current = new PreRollRing(PRE_ROLL_MS);
    pttCaptureRef.current = [];
    // Only clear the engaged flag when no press is in progress; otherwise
    // the hold timer may already have engaged and the rest of the hold must
    // still be captured.
    if (pttPressedAtRef.current === 0) {
      pttEngagedRef.current = false;
    }
    recorder.ondataavailable = (ev: BlobEvent) => {
      if (!ev.data || ev.data.size <= 0) return;
      // Every slice feeds the rolling pre-roll ring; while a PTT capture is
      // engaged the slice is ALSO appended to the live capture buffer.
      preRollRef.current.push(ev.data, WARM_TIMESLICE_MS);
      if (pttEngagedRef.current) {
        pttCaptureRef.current.push(ev.data);
      }
    };
    recorder.onerror = (ev: Event) => {
      console.debug("[useStreamingVoice] warm MediaRecorder error", ev);
      try { onVoiceErrorRef.current("stream_stale"); } catch { /* ignore */ }
    };
    recorder.onstop = () => {
      // Not expected while armed; logged so an unexpected stop is visible.
      console.debug("[useStreamingVoice] warm MediaRecorder stopped");
    };
    warmStreamRef.current = stream;
    warmRecorderRef.current = recorder;
    recorder.start(WARM_TIMESLICE_MS);
    // start() not throwing is not proof of capture; require "recording".
    // Cleanup of the stream and refs happens in the catch below.
    if (recorder.state !== "recording") {
      throw Object.assign(
        new Error(`warm MediaRecorder not recording (got ${recorder.state})`),
        { name: "RecorderNotRecording" },
      );
    }
    // Keep an AudioContext created and resumed alongside the stream.
    // Best-effort: any failure here leaves the capture itself working.
    try {
      const Ctor = (window.AudioContext
        || (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext);
      if (Ctor) {
        if (!warmCtxRef.current || warmCtxRef.current.state === "closed") {
          warmCtxRef.current = new Ctor();
        }
        if (warmCtxRef.current.state === "suspended") {
          void warmCtxRef.current.resume().catch((err) => {
            console.debug("[useStreamingVoice] warm AudioContext.resume failed", err);
            try { onVoiceErrorRef.current("audio_context_suspended"); } catch { /* ignore */ }
          });
        }
      }
    } catch {
      /* AudioContext is best-effort for warmth; capture still works */
    }
    setIsWarm(true);
    console.debug("[useStreamingVoice] warm stream armed", {
      mime: warmMimeRef.current,
      preRollMs: PRE_ROLL_MS,
      timesliceMs: WARM_TIMESLICE_MS,
    });
    return true;
  } catch (err) {
    const name = (err as { name?: string } | null)?.name ?? "Error";
    console.debug("[useStreamingVoice] warm stream arm failed", { name, err });
    // The failure may have happened after the stream was acquired (recorder
    // constructor or start() threw, or the state check failed): release the
    // stream and drop refs that point at the half-built capture.
    if (acquired !== null) {
      stopTracks(acquired);
      if (warmStreamRef.current === acquired) {
        const failed = warmRecorderRef.current;
        if (failed) detachAndStop(failed);
        warmStreamRef.current = null;
        warmRecorderRef.current = null;
      }
    }
    setIsWarm(false);
    try {
      onVoiceErrorRef.current("mic_permission_denied" as VoiceError);
    } catch {
      /* never let a user callback crash the hook */
    }
    return false;
  }
}, [pickRecorderMime]);
// Drains the pre-roll ring and returns the slices it held.
const consumePreRollChunks = useCallback(
  (): Blob[] => preRollRef.current.drain(),
  [],
);
// Transcribes an assembled PTT capture and delivers the result.
//
//   chunks  — recorder slices, lead-in first.
//   returns — the trimmed transcript that was handed to onFinalTranscript,
//             or null when nothing was submitted (no chunks, blob under the
//             noise floor, transcription failed, or the transcript was empty).
//
// A failed transcription (retryPostTranscribe yields a falsy result) is
// reported through onVoiceError("transcribe_failed").
const submitPttBlob = useCallback(
  async (chunks: Blob[]): Promise<string | null> => {
    if (chunks.length === 0) return null;
    const blob = new Blob(chunks, { type: warmMimeRef.current || "audio/webm" });
    // ~3 KB empirical noise floor: anything smaller is treated as silence.
    const MIN_BLOB_BYTES = 3000;
    if (blob.size < MIN_BLOB_BYTES) {
      console.debug("[useStreamingVoice] PTT blob below noise floor β discard", {
        bytes: blob.size,
      });
      return null;
    }
    // Do not start transcribing while a text turn is still in flight.
    await waitForTextClear();
    // Scale the per-attempt timeout with the blob size, capped at 120s.
    const APPROX_BYTES_PER_CHUNK = 100_000; // ~25s of webm/opus
    const estChunks = Math.max(1, Math.ceil(blob.size / APPROX_BYTES_PER_CHUNK));
    const attemptTimeoutMs = Math.min(120_000, 8_000 + estChunks * 12_000);
    let authoritative: string | null = null;
    const sarvam = await retryPostTranscribe(async (signal) => {
      // One controller per attempt: aborted by the attempt timeout OR by the
      // outer retry signal, whichever comes first.
      const timeoutCtl = new AbortController();
      const onOuterAbort = () => timeoutCtl.abort();
      if (signal.aborted) {
        // A listener added after the signal has already aborted never
        // fires, so propagate the abort immediately.
        timeoutCtl.abort();
      } else {
        signal.addEventListener("abort", onOuterAbort, { once: true });
      }
      const timer = setTimeout(() => timeoutCtl.abort(), attemptTimeoutMs);
      try {
        return await postTranscribe(blob, language, timeoutCtl.signal);
      } finally {
        clearTimeout(timer);
        signal.removeEventListener("abort", onOuterAbort);
      }
    });
    if (sarvam) {
      const t = (sarvam.text || "").trim();
      if (t) authoritative = t;
    } else {
      try { onVoiceErrorRef.current("transcribe_failed"); } catch { /* ignore */ }
    }
    if (authoritative) {
      // A text turn may have started while transcription was running.
      await waitForTextClear();
      onFinalRef.current(authoritative);
    }
    return authoritative;
  },
  [language, waitForTextClear],
);
// Engages the PTT capture (invoked by the hold timer set in
// beginPushToTalk). Marks the capture as engaged, so the warm recorder
// starts appending slices to pttCaptureRef, and seeds that buffer with
// whatever the pre-roll ring currently holds so the lead-in sits at the head
// of the submitted audio.
const engagePtt = useCallback(() => {
  pttEngagedRef.current = true;
  const preRollSlices = preRollRef.current.drain();
  // Copy so the capture buffer never aliases the array returned by drain().
  pttCaptureRef.current = preRollSlices.slice();
  console.debug("[useStreamingVoice] PTT engaged", {
    leadInSlices: preRollSlices.length,
  });
}, []);
// Press handler for push-to-talk. Records the press time, resets the capture
// state, makes sure the warm stream is (being) armed, and schedules the
// deliberate-hold gate: the capture is engaged only if the press is still
// held HOLD_THRESHOLD_MS later, so a short tap does nothing.
const beginPushToTalk = useCallback(() => {
  pttPressedAtRef.current = Date.now();
  pttCaptureRef.current = [];
  pttEngagedRef.current = false;
  // Fire-and-forget: armWarmStream returns early when already recording.
  void armWarmStream();
  const previousTimer = pttHoldTimerRef.current;
  if (previousTimer !== null) {
    clearTimeout(previousTimer);
  }
  const onHoldElapsed = (): void => {
    pttHoldTimerRef.current = null;
    // endPushToTalk zeroes pttPressedAtRef, so 0 means "already released".
    if (pttPressedAtRef.current === 0) return;
    engagePtt();
  };
  pttHoldTimerRef.current = setTimeout(onHoldElapsed, HOLD_THRESHOLD_MS);
}, [armWarmStream, engagePtt]);
// Release handler for push-to-talk.
//
// Clears the press state and the pending hold timer, then decides what to do
// with the capture: a deliberate hold that actually engaged is submitted
// through submitPttBlob; anything else (short tap, or release before the
// engage fired) is discarded.
//
// Resolves to the submitted transcript, or null when nothing was submitted.
const endPushToTalk = useCallback(async (): Promise<string | null> => {
  const pressedAt = pttPressedAtRef.current;
  const releasedAt = Date.now();
  pttPressedAtRef.current = 0;

  const holdTimer = pttHoldTimerRef.current;
  if (holdTimer !== null) {
    clearTimeout(holdTimer);
    pttHoldTimerRef.current = null;
  }

  // With no recorded press (pressedAt === 0) the release time stands in for
  // it, giving a zero-length hold.
  const gate = evaluateHoldGate(pressedAt || releasedAt, releasedAt, HOLD_THRESHOLD_MS);

  const wasEngaged = pttEngagedRef.current;
  pttEngagedRef.current = false;

  // The capture buffer is emptied on both the submit and the discard path.
  const captured = pttCaptureRef.current;
  pttCaptureRef.current = [];

  if (gate.deliberate && wasEngaged) {
    return submitPttBlob(captured);
  }

  console.debug("[useStreamingVoice] PTT discarded (tap)", {
    heldMs: gate.heldMs,
    deliberate: gate.deliberate,
    wasEngaged,
  });
  return null;
}, [submitPttBlob]);
| const buildRecognition = useCallback((): SpeechRecognitionInstance | null => { | |
| const Ctor = resolveCtor(); | |
| if (!Ctor) return null; | |
| const rec = new Ctor(); | |
| rec.lang = language; | |
| rec.continuous = false; | |
| rec.interimResults = true; | |
| rec.maxAlternatives = 1; | |
| rec.onstart = () => { | |
| onListeningRef.current(true); | |
| }; | |
| rec.onresult = (ev: SpeechRecognitionEventLike) => { | |
| // KI-203 (2026-05-15) β early-return while TTS is playing (or within | |
| // the POST_TTS_DROP_MS window after TTS ends). recognition.abort() | |
| // doesn't immediately stop result delivery, so we silently discard | |
| // every chunk that arrives during the dirty window. Without this, bot | |
| // TTS audio ("perfect days to get started Rohit") was leaking into | |
| // the user input field between `audio.play()` firing and our abort() | |
| // actually taking effect. | |
| if (dropResultsRef.current || isTextRequestPendingRef.current) { | |
| console.debug("[useStreamingVoice] KI-203/214 dropping recognition result", { | |
| drop: dropResultsRef.current, | |
| textPending: isTextRequestPendingRef.current, | |
| }); | |
| return; | |
| } | |
| let interim = ""; | |
| // Walk every result; finals get pushed onto finalsRef, interims get | |
| // concatenated into a running string that's displayed in the input. | |
| for (let i = 0; i < ev.results.length; i++) { | |
| const result = ev.results[i]; | |
| const alt = result[0]; | |
| if (!alt) continue; | |
| if (result.isFinal) { | |
| const t = alt.transcript.trim(); | |
| if (t) finalsRef.current.push(t); | |
| } else { | |
| interim += alt.transcript; | |
| } | |
| } | |
| // #68 β the composer must show the COMPLETE evolving transcript, not | |
| // just the current recognition session's slice. continuous=false makes | |
| // Web Speech end+restart on every sub-1.5s pause; each restart begins a | |
| // fresh result list, and finals can also be skipped here during the | |
| // TTS/text drop window above even though the audio (β Sarvam) still has | |
| // them. The authoritative running text the grace timer will submit is | |
| // `pendingUtteranceRef` (earlier graced segments of THIS utterance) + | |
| // the current session's NOT-YET-DRAINED finals + the live interim. | |
| // | |
| // Critical: finals already moved into pendingUtteranceRef on `onend` | |
| // stay in finalsRef until submit (so a late isFinal isn't lost), and | |
| // `finalsConsumedRef` is the cursor of how many were drained. Joining | |
| // ALL of finalsRef would double-count those (segment shown twice). So | |
| // we display pending + finalsRef.slice(consumed) + interim β the exact | |
| // union with no duplication and no lag behind what was captured/sent. | |
| const priorSegments = pendingUtteranceRef.current.trim(); | |
| const freshFinals = finalsRef.current | |
| .slice(finalsConsumedRef.current) | |
| .join(" ") | |
| .trim(); | |
| const running = [priorSegments, freshFinals, interim] | |
| .map((s) => s.trim()) | |
| .filter(Boolean) | |
| .join(" ") | |
| .trim(); | |
| onInterimRef.current(running); | |
| }; | |
| rec.onerror = (ev: SpeechRecognitionErrorEventLike) => { | |
| const code = ev.error; | |
| // `no-speech` and `aborted` are routine in continuous-restart mode β | |
| // no audio detected in a window, or we deliberately stopped. Silent | |
| // restart via onend. | |
| if (code === "no-speech" || code === "aborted") return; | |
| if (code === "not-allowed" || code === "service-not-allowed") { | |
| wantRunningRef.current = false; | |
| // FIX 2 (HIGH) β Terminal-error mic leak. Without teardownAudio() | |
| // here the MediaRecorder + MediaStream stay open even though | |
| // recognition has shut down, so the browser's red-dot mic | |
| // indicator stays lit and the OS thinks we're still recording. | |
| teardownAudio(); | |
| onErrorRef.current( | |
| "Mic permission denied. Click the lock icon in your browser's URL bar to enable the microphone.", | |
| ); | |
| return; | |
| } | |
| if (code === "audio-capture") { | |
| wantRunningRef.current = false; | |
| // FIX 2 (HIGH) β see above. | |
| teardownAudio(); | |
| onErrorRef.current("No microphone detected. Check your audio device and try again."); | |
| return; | |
| } | |
| if (code === "network") { | |
| // Transient β let onend's restart loop pick it up with backoff. | |
| errorBackoffRef.current = Math.min(errorBackoffRef.current + 500, 3000); | |
| return; | |
| } | |
| onErrorRef.current(`Voice error: ${code}${ev.message ? ` (${ev.message})` : ""}`); | |
| }; | |
| rec.onend = () => { | |
| onListeningRef.current(false); | |
| // KI-217 β drain only the NEW finals (everything past the consumed | |
| // cursor). DO NOT reset finalsRef here: a late-delivered isFinal | |
| // chunk arriving after onend would otherwise be wiped before the | |
| // next onend cycle can pick it up. finalsRef is reset on actual | |
| // utterance submit (grace-timer flush) and on user start/stop. | |
| const newFinals = finalsRef.current.slice(finalsConsumedRef.current); | |
| const webSpeechText = newFinals.join(" ").trim(); | |
| finalsConsumedRef.current = finalsRef.current.length; | |
| // KI-168 PHASE 2 β race guard: if a typed-text turn is in flight, | |
| // drop both transcripts on the floor (text wins). Don't start a | |
| // Sarvam fetch we'd be throwing away. | |
| const textRacing = isTextRequestPendingRef.current; | |
| // FIX 7 (HIGH) β Silent onend early-return. Chrome's "no-speech" | |
| // restart loop fires onend every ~5s with no content. Without this | |
| // guard, every silent onend re-arms the 1500ms grace timer and the | |
| // grace window extends forever β even when there's nothing pending | |
| // to submit. Skip the grace-timer reset when: | |
| // - no new Web Speech text in this cycle, AND | |
| // - no audio chunks captured this cycle (chunksRef holds the | |
| // undrained chunks that will become drainedThisEnd below), AND | |
| // - no previously pending utterance text. | |
| // We still call scheduleRestart() so the mic comes back online. | |
| const hasNewChunksThisEnd = recorderActiveRef.current && chunksRef.current.length > 0; | |
| if (!webSpeechText && !hasNewChunksThisEnd && pendingUtteranceRef.current === "") { | |
| console.debug("[useStreamingVoice] KI-222 silent onend β skipping grace reset"); | |
| // Inline the restart-only path here so we don't need to refactor | |
| // the scheduleRestart closure below it. | |
| if (wantRunningRef.current && !isTextRequestPendingRef.current) { | |
| const backoff = errorBackoffRef.current; | |
| errorBackoffRef.current = 0; | |
| clearRestartTimer(); | |
| restartTimerRef.current = setTimeout(() => { | |
| restartTimerRef.current = null; | |
| if (wantRunningRef.current) safeStart(); | |
| }, Math.max(50, backoff)); | |
| } else if (wantRunningRef.current && isTextRequestPendingRef.current) { | |
| clearRestartTimer(); | |
| restartTimerRef.current = setTimeout(() => { | |
| restartTimerRef.current = null; | |
| if (wantRunningRef.current && !isTextRequestPendingRef.current) safeStart(); | |
| }, 250); | |
| } | |
| return; | |
| } | |
| const scheduleRestart = () => { | |
| if (wantRunningRef.current && !isTextRequestPendingRef.current) { | |
| const backoff = errorBackoffRef.current; | |
| errorBackoffRef.current = 0; | |
| clearRestartTimer(); | |
| restartTimerRef.current = setTimeout(() => { | |
| restartTimerRef.current = null; | |
| if (wantRunningRef.current) safeStart(); | |
| }, Math.max(50, backoff)); | |
| } else if (wantRunningRef.current && isTextRequestPendingRef.current) { | |
| // Text turn in flight β retry shortly so mic resumes the moment | |
| // the text turn lands. | |
| clearRestartTimer(); | |
| restartTimerRef.current = setTimeout(() => { | |
| restartTimerRef.current = null; | |
| if (wantRunningRef.current && !isTextRequestPendingRef.current) safeStart(); | |
| }, 250); | |
| } | |
| }; | |
| // Pull the chunks we've accumulated so far so the recorder can keep | |
| // capturing the next utterance without us re-running getUserMedia. | |
| const drainChunks = (): Blob[] => { | |
| const drained = chunksRef.current; | |
| chunksRef.current = []; | |
| return drained; | |
| }; | |
| // KI-202 (2026-05-15) β utterance batching. Web Speech's onend fires | |
| // after ~1.5s of silence, so a natural mid-sentence pause splits one | |
| // utterance into two onend events and the user's sentence gets | |
| // submitted in halves ("First word getting cut off. Cutoff is the | |
| // biggest issue. Auto-submitting without capturing the first half | |
| // or the second half"). Instead of submitting immediately, we | |
| // append THIS onend's text + audio chunks to pendingUtterance*Ref | |
| // buffers, then start (or reset) a UTTERANCE_GRACE_MS timer. If | |
| // recognition restarts (auto-restart picks up the next word burst) | |
| // within the grace window, the next onend appends more content + | |
| // resets the timer. Only after a FULL UTTERANCE_GRACE_MS of true | |
| // silence does the timer fire and submit the accumulated buffer. | |
| // | |
| // Pauses < 1.5s merge into one turn (intended fix). | |
| // Pauses > 1.5s split (intended β that IS a new turn). | |
| // Drain the CURRENT onend's chunks now so the recorder keeps capturing | |
| // the next word burst without contamination across pending utterances. | |
| const drainedThisEnd = recorderActiveRef.current ? drainChunks() : []; | |
| if (webSpeechText) { | |
| pendingUtteranceRef.current = pendingUtteranceRef.current | |
| ? `${pendingUtteranceRef.current} ${webSpeechText}` | |
| : webSpeechText; | |
| } | |
| if (drainedThisEnd.length > 0) { | |
| pendingChunksRef.current.push(...drainedThisEnd); | |
| } | |
| console.debug("[useStreamingVoice] KI-202 onend appended to pending utterance", { | |
| thisTextLen: webSpeechText.length, | |
| thisChunkCount: drainedThisEnd.length, | |
| pendingTextLen: pendingUtteranceRef.current.length, | |
| pendingChunkCount: pendingChunksRef.current.length, | |
| textRacing, | |
| }); | |
| // Mic restart happens immediately regardless of grace window — we | |
| // WANT recognition to come back online so it can pick up the next | |
| // word burst within the grace window and append to pending. | |
| scheduleRestart(); | |
| // KI-210 (2026-05-15) — DO NOT drop pending utterance when text is | |
| // racing. Previously we cleared pendingUtteranceRef + pendingChunksRef | |
| // here, which silently lost any voice the user spoke during the bot's | |
| // text-submit/TTS-thinking gap. The downstream wait-and-retry inside | |
| // `submitPendingUtterance` (timer fire) + the post-await wait inside | |
| // the Sarvam fire-and-forget now hold the buffer until the text turn | |
| // clears, then submit. We leave `textRacing` as a debug breadcrumb in | |
| // the log above and continue accumulating. | |
// KI-210 — the grace-timer body is a named async function so it can
// re-schedule itself (wait-and-retry) while a text request is in flight
// instead of dropping the utterance. The wait is capped at 30s in total so a
// stuck text request can't keep the timer alive forever; once the cap is
// reached we submit anyway (better to submit than to drop).
const SUBMIT_WAIT_CAP_MS = 30000;
// Timestamp of the first deferred attempt; 0 means "not waiting yet". The
// surrounding handler resets it to 0 each time it re-arms the grace timer.
const submitStartTsRef = { ts: 0 };
const submitPendingUtterance = async () => {
  pendingSubmitTimerRef.current = null;
  // KI-210 — text is still in flight when the grace window fires: retry
  // every 300ms until the text turn clears or the 30s cap is reached.
  if (isTextRequestPendingRef.current) {
    if (submitStartTsRef.ts === 0) submitStartTsRef.ts = Date.now();
    if (Date.now() - submitStartTsRef.ts > SUBMIT_WAIT_CAP_MS) {
      console.debug("[useStreamingVoice] KI-210 timer wait cap reached; submitting anyway");
      // fall through and submit
    } else {
      console.debug("[useStreamingVoice] KI-210 timer fired but text in flight; waiting 300ms");
      pendingSubmitTimerRef.current = setTimeout(() => {
        void submitPendingUtterance();
      }, 300);
      return;
    }
  }
  // Detach the accumulated utterance from the shared buffers. From here on
  // the local copies below are the ONLY reference to the user's speech, so
  // every later failure path must still deliver (or deliberately skip) them.
  const accumulatedText = pendingUtteranceRef.current.trim();
  const accumulatedChunks = pendingChunksRef.current;
  pendingUtteranceRef.current = "";
  pendingChunksRef.current = [];
  // KI-217 — the utterance is now being submitted; safe to wipe finalsRef and
  // reset the consumed cursor. Any late results that arrive after this point
  // belong to a NEW utterance.
  finalsRef.current = [];
  finalsConsumedRef.current = 0;
  console.debug("[useStreamingVoice] KI-202 grace window elapsed β submitting", {
    textLen: accumulatedText.length,
    chunkCount: accumulatedChunks.length,
  });
  // No-recorder path: just submit the Web Speech text.
  if (!recorderActiveRef.current || accumulatedChunks.length === 0) {
    if (accumulatedText) {
      onFinalRef.current(accumulatedText);
    }
    return;
  }
  // Bookkeeping for the failure path of the detached task below.
  let delivered = false; // set just before onFinal is invoked, so we never double-submit
  let rearmed = false; // set once audio capture has been re-armed
  const rearmCapture = () => {
    rearmed = true;
    teardownAudio();
    if (wantRunningRef.current) {
      void ensureAudioCapture();
    }
  };
  // Sarvam path. Fire-and-forget so we don't block recognition. Because the
  // promise is detached, everything runs inside try/catch: an uncaught
  // rejection here would otherwise surface only as an unhandled rejection
  // and the already-detached utterance would be silently lost.
  void (async () => {
    try {
      // Snapshot the user-visible interim so the input area doesn't go blank
      // while Sarvam is in flight; onFinalTranscript overwrites it later.
      if (accumulatedText) onInterimRef.current(accumulatedText);
      // Stop the recorder to obtain the final dataavailable chunk for the
      // LAST burst, then fold whatever landed in chunksRef into our set.
      await stopRecorder();
      const tailChunks = drainChunks();
      const allChunks = [...accumulatedChunks, ...tailChunks];
      const totalSize = allChunks.reduce((n, b) => n + b.size, 0);
      console.debug("[useStreamingVoice] KI-202 batched submit", {
        webSpeechLen: accumulatedText.length,
        chunkCount: allChunks.length,
        blobBytes: totalSize,
      });
      // Re-arm audio capture for the next utterance (don't block on it).
      rearmCapture();
      // Skip the submit when there is neither Web Speech text nor a usable
      // amount of audio. ~3 KB is the empirical noise floor used by the PTT
      // path's KI-134 silence guard.
      const MIN_BLOB_BYTES = 3000;
      if (!accumulatedText && totalSize < MIN_BLOB_BYTES) {
        console.debug("[useStreamingVoice] KI-202 skipping submit β no text and tiny blob");
        return;
      }
      // KI-210 — if a text turn started during the await above, hold the
      // utterance until it clears instead of throwing it away.
      await waitForTextClear();
      let authoritativeText = accumulatedText;
      if (allChunks.length > 0 && totalSize >= MIN_BLOB_BYTES) {
        const blob = new Blob(allChunks, { type: recorderMimeRef.current || "audio/webm" });
        // KI-226 (V5.4) — the POST goes through retryPostTranscribe (the
        // original note describes it as 1s/2s/4s exponential backoff, max 3
        // attempts). Text and chunks are captured locally, so retries don't
        // lose the partial transcript.
        //
        // KI-302 — the backend splits long audio into ~25s chunks and
        // transcribes them sequentially, so the per-attempt timeout must
        // scale with the audio size or a long utterance is aborted
        // mid-flight. Budget: 8s base + 12s per estimated chunk (so at least
        // 20s), capped at 120s. Chunk count is estimated from the blob size
        // at ~100 KB per 25s of webm/opus (≈ 4 KB/s).
        const APPROX_BYTES_PER_CHUNK = 100_000; // ~25s of webm/opus
        const estChunks = Math.max(
          1,
          Math.ceil(blob.size / APPROX_BYTES_PER_CHUNK),
        );
        const attemptTimeoutMs = Math.min(
          120_000, // hard ceiling — never wait > 2 min on one attempt
          8_000 + estChunks * 12_000,
        );
        console.debug("[useStreamingVoice] POST /api/transcribe", {
          bytes: blob.size,
          mime: blob.type,
          lang: language,
          estChunks,
          attemptTimeoutMs,
        });
        const sarvam = await retryPostTranscribe(async (signal) => {
          // Per-attempt timeout, also aborted when the OUTER retry signal
          // fires, so a hung connection surfaces as an attempt failure
          // instead of blocking forever.
          const timeoutCtl = new AbortController();
          const timer = setTimeout(() => timeoutCtl.abort(), attemptTimeoutMs);
          const onOuterAbort = () => timeoutCtl.abort();
          signal.addEventListener("abort", onOuterAbort);
          try {
            return await postTranscribe(blob, language, timeoutCtl.signal);
          } finally {
            clearTimeout(timer);
            signal.removeEventListener("abort", onOuterAbort);
          }
        });
        if (sarvam) {
          const sarvamText = (sarvam.text || "").trim();
          if (sarvamText) {
            authoritativeText = sarvamText;
            console.debug("[useStreamingVoice] Sarvam OK", {
              latency_ms: sarvam.latency_ms,
              webSpeechLen: accumulatedText.length,
              sarvamLen: sarvamText.length,
            });
          } else {
            console.debug("[useStreamingVoice] Sarvam returned empty; using Web Speech fallback");
          }
        } else {
          console.debug("[useStreamingVoice] Sarvam failed after retries; using Web Speech fallback");
          try { onVoiceErrorRef.current("transcribe_failed"); } catch { /* ignore */ }
        }
      }
      // KI-210 — final wait after the Sarvam round-trip. Don't drop the
      // now-authoritative transcript if text raced us during the network call.
      if (authoritativeText) {
        await waitForTextClear();
        delivered = true;
        onFinalRef.current(authoritativeText);
      }
    } catch (err: unknown) {
      // Something in the detached pipeline threw. The Web Speech transcript
      // is still in hand, so deliver that rather than losing the turn.
      console.debug("[useStreamingVoice] batched submit threw; using Web Speech fallback", err);
      if (!rearmed) {
        try { rearmCapture(); } catch { /* ignore */ }
      }
      if (delivered || !accumulatedText) return;
      try { await waitForTextClear(); } catch { /* deliver anyway */ }
      try { onFinalRef.current(accumulatedText); } catch { /* never rethrow from a detached task */ }
    }
  })();
};
| // (Re)start the grace-window timer. Every onend resets it, so as long | |
| // as the user keeps starting new word bursts within 1.5s of the last | |
| // silence, the timer never fires and the utterance keeps growing. | |
| if (pendingSubmitTimerRef.current !== null) { | |
| clearTimeout(pendingSubmitTimerRef.current); | |
| } | |
| submitStartTsRef.ts = 0; | |
| pendingSubmitTimerRef.current = setTimeout(() => { | |
| void submitPendingUtterance(); | |
| }, UTTERANCE_GRACE_MS); | |
| }; | |
| return rec; | |
| }, [language, isTextRequestPendingRef, clearRestartTimer, safeStart, stopRecorder, teardownAudio, ensureAudioCapture, waitForTextClear]); | |
// Begin live voice: mark the session as wanted, make sure a recognition
// instance exists, reset the finals cursor, then start recognition once
// audio capture has been attempted. Reports an error through onError and
// does nothing else when the browser lacks support.
const start = useCallback(() => {
  if (!isSupported) {
    onErrorRef.current(
      "Live voice not supported in this browser. Use push-to-talk or type instead.",
    );
    return;
  }
  wantRunningRef.current = true;
  if (!recognitionRef.current) {
    recognitionRef.current = buildRecognition();
  }
  finalsRef.current = [];
  finalsConsumedRef.current = 0;
  // W1 (2026-05-15) — gate the SR start on the `getUserMedia` attempt.
  // Running capture and recognition in parallel let recognition start (and
  // the pill flip to "listening") on a permission denial while the mic was
  // actually dead. Awaiting the capture result first lets the
  // onVoiceError("mic_permission_denied") handling land BEFORE recognition
  // kicks off; per the original note, ensureAudioCapture's own catch clears
  // wantRunningRef and emits onVoiceError on a hard denial.
  void (async () => {
    const ok = await ensureAudioCapture();
    // The session may have been cancelled while we were awaiting — either
    // the hard-denial path cleared wantRunningRef, or stop() ran (e.g. the
    // user toggled voice off during the permission prompt). In both cases
    // recognition must not start. If capture did succeed after stop()
    // already tore audio down, release it again so it doesn't stay open on a
    // stopped session.
    if (!wantRunningRef.current) {
      if (ok) teardownAudio();
      return;
    }
    // Either capture is live, or it failed softly with wantRunning still
    // true (e.g. a MediaRecorder mime mismatch): fall back to
    // Web-Speech-only and let onend's restart loop carry on from there.
    safeStart();
  })();
}, [isSupported, buildRecognition, safeStart, ensureAudioCapture, teardownAudio]);
// Stop live voice: abort and detach the recognition instance, tear down
// audio capture, reset all per-utterance state and report "not listening".
// A pending utterance is flushed to onFinal unless a text request is in
// flight, in which case it is dropped. Safe to call repeatedly.
const stop = useCallback(() => {
  wantRunningRef.current = false;
  clearRestartTimer();
  const rec = recognitionRef.current;
  if (rec) {
    try {
      rec.abort();
    } catch {
      // ignore
    }
    // FIX 1 (HIGH) — unbind handlers and null the ref so late
    // onresult/onend events delivered AFTER abort() can't mutate finalsRef /
    // pendingUtteranceRef / pendingChunksRef or re-arm the grace timer on a
    // torn-down session.
    try {
      rec.onresult = null;
      rec.onerror = null;
      rec.onend = null;
      rec.onstart = null;
    } catch {
      // ignore — some browsers reject null assignment on EventTarget props
    }
  }
  recognitionRef.current = null;
  teardownAudio();
  finalsRef.current = [];
  finalsConsumedRef.current = 0;
  // Snapshot the pending utterance, then clear the grace timer and buffers
  // BEFORE invoking any caller callback. This matches the barge-in flush
  // (clear first, then onFinal) and means a callback that re-enters stop()
  // finds nothing left to flush, so the utterance can't be submitted twice.
  // It also guarantees no stale half-sentence survives into the next session.
  const finalPending = pendingUtteranceRef.current.trim();
  if (pendingSubmitTimerRef.current !== null) {
    clearTimeout(pendingSubmitTimerRef.current);
    pendingSubmitTimerRef.current = null;
  }
  pendingUtteranceRef.current = "";
  pendingChunksRef.current = [];
  // FIX 6 (HIGH) — mid-utterance toggle-off flush. If the user finished a
  // sentence and toggled voice off inside the grace window, submit it rather
  // than silently dropping it. Only flush when no text request is racing;
  // otherwise dropping is safer than colliding with an in-flight turn.
  if (finalPending) {
    if (!isTextRequestPendingRef.current) {
      console.debug("[useStreamingVoice] KI-222 flushing pending on stop", { len: finalPending.length });
      try {
        onFinalRef.current(finalPending);
      } catch {
        // never let a callback throw break stop()
      }
    } else {
      console.debug("[useStreamingVoice] KI-222 dropping pending on stop; text request in flight", {
        len: finalPending.length,
      });
    }
  }
  onListeningRef.current(false);
}, [clearRestartTimer, teardownAudio, isTextRequestPendingRef]);
// Mirror the `enabled` prop onto the recognition lifecycle so the caller
// never has to invoke start()/stop() itself (same contract as
// useLiveConversation's `live` state). stop() doubles as the effect cleanup,
// so it also runs before every re-run and on unmount.
useEffect(() => {
  const applyEnabled = enabled ? start : stop;
  applyEnabled();
  return () => {
    stop();
  };
  // eslint-disable-next-line react-hooks/exhaustive-deps
}, [enabled]);
// #53 / #54 — warm-stream arming. The warm stream is DELIBERATELY decoupled
// from `enabled`: per the original note, page.tsx flips `enabled` OFF for
// pure push-to-talk (`live.setLive(false)` inside startRecording). Once the
// user has opted into voice, the stream stays latched on so that:
//   (a) the pre-roll ring keeps filling whenever the user might press SPACE,
//       including pure-PTT mode with `enabled` false, so the first word
//       spoken during page.tsx's cold-start gap survives (#53);
//   (b) the open audio device keeps the OS mic hot, so page.tsx's per-press
//       getUserMedia resolves quickly instead of cold-starting (#54).
// The latch is set on the rising edge of `enabled` (the only opt-in signal
// this hook receives). This effect has no cleanup on purpose: the stream
// must survive the Live→PTT toggle; the original note says the final release
// happens in an unmount cleanup elsewhere in the hook.
useEffect(() => {
  if (!isSupported) return;
  if (enabled) {
    voiceEverEnabledRef.current = true;
  }
  // Once latched, every run of this effect re-asserts the warm stream.
  if (!voiceEverEnabledRef.current) return;
  void armWarmStream();
}, [enabled, isSupported, armWarmStream]);
// #53 / #54 — warm-stream health watchdog. The OS can silently drop a
// long-lived capture (device sleep, USB mic unplug, OS audio interruption,
// tab backgrounding on some browsers) WITHOUT recorder.onerror firing, which
// would leave the pre-roll ring stale. Every 4s, once voice has been opted
// into and no PTT capture is in progress, re-arm the warm stream if the
// recorder is missing, not in the "recording" state, or has lost its stream.
useEffect(() => {
  if (!isSupported) return;
  const warmStreamLooksDead = (): boolean => {
    const recorder = warmRecorderRef.current;
    return !recorder || recorder.state !== "recording" || !warmStreamRef.current;
  };
  const watchdogId = setInterval(() => {
    if (!voiceEverEnabledRef.current) return;
    // Don't disturb an in-flight push-to-talk capture.
    if (pttEngagedRef.current) return;
    if (warmStreamLooksDead()) {
      void armWarmStream();
    }
  }, 4000);
  return () => clearInterval(watchdogId);
}, [isSupported, armWarmStream]);
// KI-173 (2026-05-15) — recognition heartbeat. Browser SpeechRecognition
// occasionally ends up stopped without `onend` ever firing (certain network
// errors, transient OS audio interruptions, tab-visibility edge cases), so
// the auto-restart in `onend` never gets a chance to run and the mic stays
// silently dead until the user toggles voice off and on. While `enabled`,
// every 4s we call safeStart() provided that:
//   - we still want to be listening (wantRunningRef),
//   - no text turn is in flight,
//   - TTS is not playing (KI-188 — no revival during bot playback),
//   - no restart is already scheduled.
// Per the original note, safeStart() swallows the InvalidStateError raised
// when recognition is already running, so calling it blindly is safe.
useEffect(() => {
  if (!(enabled && isSupported)) return;
  const heartbeatId = setInterval(() => {
    if (!wantRunningRef.current) return;
    if (isTextRequestPendingRef.current) return;
    if (isTtsPlayingRef.current) return;
    if (restartTimerRef.current !== null) return;
    safeStart();
  }, 4000);
  return () => clearInterval(heartbeatId);
}, [enabled, isSupported, isTextRequestPendingRef, safeStart]);
| // KI-188 (2026-05-15) — TTS playback gate. Browser Web Speech API has | |
| // its own internal mic pipeline that bypasses our getUserMedia AEC | |
| // constraints (KI-185), so SpeechRecognition transcribes the bot's TTS | |
| // audio bleeding from speakers as if it were user input. The visible | |
| // echo "perfect days to get started Rohit" was echo of bot's TTS | |
| // "perfect age to get started, Rohit". The only reliable JS-level fix | |
| // is to ABORT recognition while ANY <audio> element in the DOM is | |
| // playing, then revive via the heartbeat (KI-173) the moment all | |
| // audio ends. | |
| // | |
| // Trade-off: live "barge-in by just speaking" is disabled DURING TTS. | |
| // Push-to-talk still works (it uses MediaRecorder, not SpeechRecognition). | |
| useEffect(() => { | |
| if (!enabled || !isSupported) return; | |
| if (typeof document === "undefined") return; | |
| // KI-189 (2026-05-15) — barge-in VAD state. The AnalyserNode + AudioContext | |
| // are lazily created on first TTS-playback and reused for subsequent | |
| // playbacks to avoid repeated AudioContext spin-up cost (Chrome warns | |
| // when >6 contexts coexist). | |
| let audioCtx: AudioContext | null = null; | |
| let analyser: AnalyserNode | null = null; | |
| let sourceNode: MediaStreamAudioSourceNode | null = null; | |
| let attachedStream: MediaStream | null = null; | |
| let rmsBuf: Float32Array<ArrayBuffer> | null = null; | |
| let sustainedFrames = 0; | |
| let rafId: number | null = null; | |
| // KI-190 — per-<audio> bot-RMS analysers for adaptive threshold. | |
| // Each watched audio element gets its own MediaElementAudioSourceNode + | |
| // AnalyserNode so we can read the bot's instantaneous playback level | |
| // during a barge-in tick. Map keyed by the audio element. | |
| const botAnalysers = new Map<HTMLAudioElement, { | |
| source: MediaElementAudioSourceNode; | |
| analyser: AnalyserNode; | |
| buf: Float32Array<ArrayBuffer>; | |
| }>(); | |
| // Track which <audio> elements we've dimmed so we can restore on cleanup. | |
| const duckedAudios = new Set<HTMLAudioElement>(); | |
| // KI-195 — user-speech RMS tracker + per-element calibrated volume. | |
| // userSpeechRms is the rolling peak of mic RMS observed while the user | |
| // is actively speaking (recorder active, not TTS). It seeds the bot | |
| // volume target. Calibrated volumes per element survive across turns | |
| // so we don't have to re-learn after every reply. | |
| let userSpeechRms = USER_SPEECH_RMS_INITIAL; | |
| const calibratedVolumes = new Map<HTMLAudioElement, number>(); | |
| let userRmsRafId: number | null = null; | |
| let volumeCalibIntervalId: ReturnType<typeof setInterval> | null = null; | |
| // FIX 5 (HIGH) — wall-clock decay interval. The rAF-driven userRmsTick | |
| // is gated on `!isTtsPlaying`, so during bot TTS playback there is NO | |
| // decay of userSpeechRms — a shout right before the bot starts speaking | |
| // would pin userSpeechRms at its ceiling for the entire bot turn. This | |
| // setInterval runs unconditionally while `enabled` is true, so the rolling | |
| // peak decays toward USER_SPEECH_RMS_INITIAL on a wall-clock schedule that | |
| // is independent of the rAF gate. | |
| let userRmsWallClockIntervalId: ReturnType<typeof setInterval> | null = null; | |
// Read one frame of time-domain samples from the mic analyser and return
// its RMS level. Returns 0 when the analyser or its buffer is not set up,
// or when reading the samples throws.
const sampleUserRms = (): number => {
  const micAnalyser = analyser;
  const frame = rmsBuf;
  if (!micAnalyser || !frame) return 0;
  try {
    micAnalyser.getFloatTimeDomainData(frame);
  } catch {
    return 0;
  }
  let energy = 0;
  for (const sample of frame) {
    energy += sample * sample;
  }
  return Math.sqrt(energy / frame.length);
};
// One animation-frame step of the user-loudness tracker. It only learns
// while the user may be speaking (voice wanted, recorder active, no TTS);
// otherwise the loop ends by clearing userRmsRafId without re-scheduling.
const userRmsTick = () => {
  const mayBeSpeaking =
    wantRunningRef.current
    && !isTtsPlayingRef.current
    && recorderActiveRef.current;
  if (!mayBeSpeaking || !analyser || !rmsBuf) {
    userRmsRafId = null;
    return;
  }
  const level = sampleUserRms();
  // Frames at or below the detection threshold are not treated as speech.
  // Above it, userSpeechRms acts as a decaying peak-hold: each speech frame
  // keeps the larger of the current level and 95% of the previous peak, so
  // a single shout fades instead of raising the baseline for good.
  if (level > USER_SPEECH_DETECTION_THRESHOLD) {
    const heldPeak = Math.max(userSpeechRms * 0.95, level);
    // FIX 5 (HIGH) — clamp to the ceiling so one shout can't pin the value
    // high and break subsequent barge-in detection.
    userSpeechRms = Math.min(heldPeak, USER_SPEECH_RMS_CEILING);
  }
  userRmsRafId = requestAnimationFrame(userRmsTick);
};
// Start the user-loudness rAF loop unless one is already scheduled. The loop
// reads the shared VAD analyser; if that hasn't been created yet, the first
// tick simply exits and a later state transition has to start the loop again.
const startUserRmsLoop = () => {
  const alreadyScheduled = userRmsRafId !== null;
  if (alreadyScheduled) return;
  userRmsRafId = requestAnimationFrame(userRmsTick);
};
// Cancel the user-loudness rAF loop if a frame is currently scheduled.
const stopUserRmsLoop = () => {
  if (userRmsRafId === null) return;
  cancelAnimationFrame(userRmsRafId);
  userRmsRafId = null;
};
// FIX 5 (HIGH) — wall-clock decay of the user-loudness peak. Every
// USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS the peak is multiplied by
// USER_SPEECH_RMS_WALL_CLOCK_DECAY_FACTOR, never dropping below
// USER_SPEECH_RMS_INITIAL. Unlike the rAF tick, this timer does not look at
// TTS state, so the peak can't stay pinned high through a long bot turn.
// Starting is a no-op while the timer is already running.
const startUserRmsWallClockDecay = () => {
  if (userRmsWallClockIntervalId !== null) return;
  const decayOnce = () => {
    const decayed = userSpeechRms * USER_SPEECH_RMS_WALL_CLOCK_DECAY_FACTOR;
    userSpeechRms = Math.max(USER_SPEECH_RMS_INITIAL, decayed);
  };
  userRmsWallClockIntervalId = setInterval(decayOnce, USER_SPEECH_RMS_WALL_CLOCK_DECAY_MS);
};
// Stop the wall-clock decay timer if it is running.
const stopUserRmsWallClockDecay = () => {
  if (userRmsWallClockIntervalId === null) return;
  clearInterval(userRmsWallClockIntervalId);
  userRmsWallClockIntervalId = null;
};
// KI-195 — one volume-calibration tick, meant to run while TTS is playing.
// It compares the bot's playback level (computeBotRms) with a target derived
// from the user's speech level (userSpeechRms * VOLUME_CALIB_TARGET_RATIO).
// When the bot is louder than the target, every playing TTS element is ducked
// by VOLUME_CALIB_DUCK_FACTOR, never below VOLUME_CALIB_FLOOR, and the new
// volume is remembered per element. When TTS is no longer playing, the tick
// cancels its own interval.
const calibrateBotVolume = () => {
  if (!isTtsPlayingRef.current) {
    if (volumeCalibIntervalId !== null) {
      clearInterval(volumeCalibIntervalId);
      volumeCalibIntervalId = null;
    }
    return;
  }
  const targetLevel = userSpeechRms * VOLUME_CALIB_TARGET_RATIO;
  const botLevel = computeBotRms();
  if (!(botLevel > targetLevel)) return;
  ttsAudioElementsRef.current.forEach((el) => {
    if (el.paused || el.ended) return;
    const before = el.volume;
    const ducked = Math.max(VOLUME_CALIB_FLOOR, before * VOLUME_CALIB_DUCK_FACTOR);
    // Skip writes that would not change the volume meaningfully.
    if (!(ducked < before - 0.001)) return;
    try {
      el.volume = ducked;
      calibratedVolumes.set(el, ducked);
    } catch { /* ignore */ }
  });
};
// Start the periodic bot-volume calibration (one calibrateBotVolume call
// every VOLUME_CALIB_TICK_MS). No-op while it is already running.
const startVolumeCalibration = () => {
  const running = volumeCalibIntervalId !== null;
  if (running) return;
  volumeCalibIntervalId = setInterval(calibrateBotVolume, VOLUME_CALIB_TICK_MS);
};
// Stop the periodic bot-volume calibration if it is running.
const stopVolumeCalibration = () => {
  if (volumeCalibIntervalId === null) return;
  clearInterval(volumeCalibIntervalId);
  volumeCalibIntervalId = null;
};
// Halt the barge-in detection loop: cancel the scheduled animation frame (if
// any) and reset the sustained-frame counter, whether or not a frame was
// pending.
const stopBargeInLoop = () => {
  const pendingFrame = rafId;
  if (pendingFrame !== null) {
    cancelAnimationFrame(pendingFrame);
    rafId = null;
  }
  sustainedFrames = 0;
};
// Release every Web Audio resource owned by this effect: stop the barge-in
// loop, disconnect and forget the mic source/analyser, disconnect and forget
// all per-element bot analysers (KI-190), then close the shared AudioContext.
// Disconnect/close failures are ignored.
const teardownAnalyser = () => {
  stopBargeInLoop();
  const disconnectQuietly = (node: AudioNode | null) => {
    try { node?.disconnect(); } catch { /* ignore */ }
  };
  disconnectQuietly(sourceNode);
  disconnectQuietly(analyser);
  sourceNode = null;
  analyser = null;
  attachedStream = null;
  rmsBuf = null;
  for (const tap of botAnalysers.values()) {
    disconnectQuietly(tap.source);
    disconnectQuietly(tap.analyser);
  }
  botAnalysers.clear();
  // Drop our reference before closing so nothing reuses a closing context.
  const closing = audioCtx;
  audioCtx = null;
  if (closing) {
    try { void closing.close(); } catch { /* ignore */ }
  }
};
// KI-190 — return the effect's shared AudioContext, creating it on demand.
// An existing context is reused unless it has been closed. Falls back to the
// prefixed `webkitAudioContext` constructor; returns null when no
// constructor is available or construction throws.
const ensureAudioCtx = (): AudioContext | null => {
  const existing = audioCtx;
  if (existing && existing.state !== "closed") return existing;
  try {
    const prefixed = (window as unknown as { webkitAudioContext?: typeof AudioContext })
      .webkitAudioContext;
    const ContextCtor = window.AudioContext || prefixed;
    if (!ContextCtor) return null;
    const created = new ContextCtor();
    audioCtx = created;
    return created;
  } catch {
    return null;
  }
};
// KI-190 — tap a bot <audio> element with an AnalyserNode so its playback
// level can be measured. The element is routed source → analyser →
// destination, so it stays audible. Elements that already have a tap are
// skipped, as is the case where no AudioContext is available.
// createMediaElementSource throws when the element is already routed through
// Web Audio; that (and any other failure) is swallowed, leaving the element
// without a tap.
const attachBotAnalyser = (el: HTMLAudioElement) => {
  if (botAnalysers.has(el)) return;
  const ctx = ensureAudioCtx();
  if (!ctx) return;
  try {
    const elementSource = ctx.createMediaElementSource(el);
    const levelTap = ctx.createAnalyser();
    levelTap.fftSize = 1024;
    levelTap.smoothingTimeConstant = 0.4;
    elementSource.connect(levelTap);
    levelTap.connect(ctx.destination);
    botAnalysers.set(el, {
      source: elementSource,
      analyser: levelTap,
      // One float (4 bytes) per time-domain sample.
      buf: new Float32Array(new ArrayBuffer(levelTap.fftSize * 4)),
    });
  } catch {
    // Already routed through Web Audio elsewhere, or the context could not
    // be used — no bot-level reference is available for this element.
  }
};
// KI-190 — loudest current RMS across all tapped bot <audio> elements that
// are actually playing; 0 when none are. The maximum is used rather than the
// sum: per the original note only one TTS clip plays at a time in practice,
// and paused or ended elements still present in the map are ignored.
const computeBotRms = (): number => {
  let loudest = 0;
  for (const [el, tap] of botAnalysers) {
    if (el.paused || el.ended) continue; // idle element
    const samples = tap.buf;
    tap.analyser.getFloatTimeDomainData(samples);
    let energy = 0;
    for (const sample of samples) {
      energy += sample * sample;
    }
    // Per the original note, the element source is post-volume, so this
    // level already reflects any ducking applied to el.volume.
    const level = Math.sqrt(energy / samples.length);
    if (level > loudest) loudest = level;
  }
  return loudest;
};
// Fire a barge-in: the user has started speaking over the bot's TTS.
//   1. KI-227 (V6.7) — if an utterance is still held by the grace-window
//      timer and no text request is in flight, clear the buffers and timer
//      and deliver it now through onFinal instead of letting it wait.
//   2. FIX 3 (HIGH) — raise bargeInRequestedRef so the caller can abort the
//      in-flight chat request; otherwise the next TTS chunk would mount a
//      new <audio> and the bot would resume talking after the interruption.
//   3. Pause and rewind every tracked TTS <audio>, then stop the VAD loop.
// `rms` is only used for the debug log. Note that the logged `threshold` is
// the static BARGE_IN_RMS_THRESHOLD constant.
const triggerBargeIn = (rms: number) => {
  console.debug("[useStreamingVoice] KI-189 barge-in detected", {
    rms: rms.toFixed(4),
    frames: sustainedFrames,
    threshold: BARGE_IN_RMS_THRESHOLD,
  });
  try {
    const heldText = pendingUtteranceRef.current.trim();
    if (heldText && !isTextRequestPendingRef.current) {
      console.debug("[useStreamingVoice] V6.7 flushing pending utterance on barge-in", {
        len: heldText.length,
      });
      pendingUtteranceRef.current = "";
      pendingChunksRef.current = [];
      finalsRef.current = [];
      finalsConsumedRef.current = 0;
      const graceTimer = pendingSubmitTimerRef.current;
      if (graceTimer !== null) {
        clearTimeout(graceTimer);
        pendingSubmitTimerRef.current = null;
      }
      onFinalRef.current(heldText);
    }
  } catch (err) {
    // A throwing flush must never break the rest of the barge-in pipeline.
    console.debug("[useStreamingVoice] V6.7 pending flush threw", err);
  }
  bargeInRequestedRef.current = true;
  // Per the original note, the pause listener installed elsewhere in this
  // effect reacts to these pauses by clearing isTtsPlayingRef and restarting
  // recognition.
  ttsAudioElementsRef.current.forEach((el) => {
    try {
      el.pause();
      el.currentTime = 0;
    } catch {
      // ignore
    }
  });
  stopBargeInLoop();
};
/**
 * One requestAnimationFrame step of the barge-in voice-activity detector.
 *
 * Measures the RMS energy and zero-crossing count of the current mic frame,
 * derives the adaptive trigger threshold, and calls triggerBargeIn() once
 * BARGE_IN_SUSTAINED_FRAMES consecutive frames qualify as user speech.
 * Re-schedules itself every frame until gating fails or a barge-in fires.
 */
const bargeInTick = () => {
  // Gating is re-evaluated on every frame so the loop exits cleanly when
  // state changes mid-loop or the analyser graph is no longer available.
  const micAnalyser = analyser;
  const samples = rmsBuf;
  if (
    !isTtsPlayingRef.current
    || !wantRunningRef.current
    || isTextRequestPendingRef.current
    || !micAnalyser
    || !samples
  ) {
    stopBargeInLoop();
    return;
  }
  micAnalyser.getFloatTimeDomainData(samples);

  // FIX 4 (HIGH) - single pass over the frame: accumulate energy for the
  // RMS and count sign flips for the zero-crossing rate. The original notes
  // say typing has a very high ZCR and HVAC / room rumble a very low one, so
  // frames outside the speech band are rejected further down.
  const frameLength = samples.length;
  let energy = 0;
  let zeroCrossings = 0;
  // Seeded from the first sample, so index 0 can never count as a crossing.
  let wasNonNegative = samples[0] >= 0;
  for (let idx = 0; idx < frameLength; idx += 1) {
    const sample = samples[idx];
    energy += sample * sample;
    const isNonNegative = sample >= 0;
    if (isNonNegative !== wasNonNegative) {
      zeroCrossings += 1;
      wasNonNegative = isNonNegative;
    }
  }
  const rms = Math.sqrt(energy / frameLength);

  // KI-228 (V6.8) - every frame is fed to the adaptive noise-floor
  // estimator BEFORE its threshold is read. Per the KI-228 notes the
  // estimator only learns from frames that look like silence.
  const noiseFloor = noiseFloorRef.current;
  noiseFloor.feed(rms);
  const noiseAdaptiveThreshold = noiseFloor.currentThreshold();

  // KI-190 / KI-228 / KI-285 - the trigger threshold is the largest of:
  //   1. a static floor: BARGE_IN_RMS_THRESHOLD when a bot-level reference
  //      is available, otherwise the echo-safe BARGE_IN_NO_BOTREF_FLOOR
  //      (computeBotRms() yielding 0 is treated as "no usable reference");
  //   2. the noise-floor adaptive threshold for the room;
  //   3. the bot-level term botRms * MULTIPLIER + BASE.
  const botRms = computeBotRms();
  const staticFloor = botRms > 0 ? BARGE_IN_RMS_THRESHOLD : BARGE_IN_NO_BOTREF_FLOOR;
  const botScaledThreshold =
    botRms * BARGE_IN_BOT_RMS_MULTIPLIER + BARGE_IN_BASE_THRESHOLD;
  const adaptiveThreshold = Math.max(
    staticFloor,
    noiseAdaptiveThreshold,
    botScaledThreshold,
  );

  // FIX 4 / KI-225 (V1.3) - the speech ZCR band comes from zcrBandRef, which
  // startBargeInLoop scales to the AudioContext sample rate.
  const { min: zcrMin, max: zcrMax } = zcrBandRef.current;
  const isSpeechBand = zeroCrossings >= zcrMin && zeroCrossings <= zcrMax;

  // KI-285 - echo-suppression grace window. For BARGE_IN_GRACE_MS after the
  // playback-start stamp, mic energy is assumed to be the bot's own echo, so
  // frames neither accumulate nor trigger. A stamp of 0 (no active playback
  // stamp) is treated as still-in-grace so a stale timeline never triggers.
  const playbackStartedAt = ttsPlaybackStartedAtRef.current;
  const inGraceWindow =
    playbackStartedAt === 0
    || Date.now() - playbackStartedAt < BARGE_IN_GRACE_MS;

  const countsTowardBargeIn =
    !inGraceWindow && rms >= adaptiveThreshold && isSpeechBand;
  if (!countsTowardBargeIn) {
    // Any disqualified frame (grace, quiet, or out-of-band) resets the run,
    // so partial credit can never straddle the grace boundary.
    sustainedFrames = 0;
  } else {
    sustainedFrames += 1;
    if (sustainedFrames >= BARGE_IN_SUSTAINED_FRAMES) {
      // triggerBargeIn stops the loop itself; do not re-schedule.
      triggerBargeIn(rms);
      return;
    }
  }
  // Keep the loop alive (also during the grace window) so detection re-arms
  // the moment the window elapses.
  rafId = requestAnimationFrame(bargeInTick);
};
/**
 * Starts (or restarts) the barge-in VAD loop for the current mic stream.
 *
 * No-op unless voice mode is wanted, no text request is pending, the
 * recorder is active, and the captured stream has at least one audio track.
 * Lazily creates the AudioContext, (re)builds the AnalyserNode graph when
 * needed, resets the sustained-frame counter and schedules bargeInTick.
 * Any failure while building the graph is logged and followed by
 * teardownAnalyser().
 */
const startBargeInLoop = () => {
  // Gating: voice mode active, no racing text turn, MediaRecorder live.
  if (!wantRunningRef.current) return;
  if (isTextRequestPendingRef.current) return;
  if (!recorderActiveRef.current) return;
  const stream = mediaStreamRef.current;
  if (!stream || stream.getAudioTracks().length === 0) return;
  try {
    // Create the AudioContext on first use, or replace one that was closed.
    if (!audioCtx || audioCtx.state === "closed") {
      const Ctor = (window.AudioContext
        || (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext);
      if (!Ctor) return;
      audioCtx = new Ctor();
    }
    if (audioCtx.state === "suspended") {
      // KI-223 (V1.1) - best-effort resume. If it rejects (autoplay policy
      // needs a user gesture), report a structured error so the UI can
      // prompt the user; otherwise the VAD would silently never fire.
      void audioCtx.resume().catch((err) => {
        console.debug("[useStreamingVoice] V1.1 AudioContext.resume failed", err);
        try { onVoiceErrorRef.current("audio_context_suspended"); } catch { /* ignore */ }
      });
    }
    // Rebuild the analyser graph when there is none, when the mic stream was
    // swapped out, or when the existing analyser belongs to a different
    // (previously closed) AudioContext. Without the last check, a context
    // recreated above would keep reading from a dead graph.
    if (
      !analyser
      || attachedStream !== stream
      || analyser.context !== audioCtx
    ) {
      try { sourceNode?.disconnect(); } catch { /* ignore */ }
      try { analyser?.disconnect(); } catch { /* ignore */ }
      analyser = audioCtx.createAnalyser();
      analyser.fftSize = 2048;
      analyser.smoothingTimeConstant = 0.5;
      sourceNode = audioCtx.createMediaStreamSource(stream);
      sourceNode.connect(analyser);
      attachedStream = stream;
      // One float (4 bytes) per time-domain sample.
      rmsBuf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4));
      // KI-225 (V1.3) - log when the track's reported sample rate disagrees
      // with the AudioContext's, and scale the speech ZCR band to the
      // context rate so the band check in bargeInTick stays meaningful.
      try {
        const trackRate = stream.getAudioTracks()[0]?.getSettings?.().sampleRate;
        const ctxRate = audioCtx.sampleRate;
        if (trackRate && Math.abs(trackRate - ctxRate) > 100) {
          console.debug(
            "[useStreamingVoice] V1.3 sample-rate mismatch",
            { trackRate, ctxRate },
          );
        }
        zcrBandRef.current = scaleSpeechZcrBand(ctxRate);
      } catch {
        // Reading the track settings failed - still scale the band from the
        // AudioContext's own sample rate.
        zcrBandRef.current = scaleSpeechZcrBand(audioCtx.sampleRate);
      }
    }
    // Fresh run: clear the counter and make sure only one rAF is pending.
    sustainedFrames = 0;
    if (rafId !== null) cancelAnimationFrame(rafId);
    rafId = requestAnimationFrame(bargeInTick);
  } catch (err) {
    console.debug("[useStreamingVoice] KI-189 VAD init failed", err);
    teardownAnalyser();
  }
};
/**
 * Recomputes whether any watched TTS <audio> element is currently playing
 * (not paused and not ended) and reacts to the two edges of that state:
 *
 *  - not playing -> playing: stamp the playback start, drop recognition
 *    results, abort recognition, re-duck volumes, start volume calibration
 *    and arm the barge-in VAD loop.
 *  - playing -> not playing: clear the stamp, schedule the end of the
 *    result-drop window, stop the VAD loop / calibration, and resume
 *    recognition when the gates allow.
 *
 * Registered as the play / playing / pause / ended listener on every
 * watched element; calls that do not change the state are no-ops.
 */
const updateTtsState = () => {
  let anyPlaying = false;
  ttsAudioElementsRef.current.forEach((el) => {
    if (!el.paused && !el.ended) anyPlaying = true;
  });
  const wasPlaying = isTtsPlayingRef.current;
  isTtsPlayingRef.current = anyPlaying;
  if (anyPlaying && !wasPlaying) {
    // TTS just started - abort any in-flight recognition so it stops
    // transcribing the bot voice.
    // KI-285 - stamp the playback-start instant so the barge-in tick can
    // suppress detection during the BARGE_IN_GRACE_MS echo window. This is
    // the only false->true edge, so it marks the start of the reply.
    ttsPlaybackStartedAtRef.current = Date.now();
    console.debug("[useStreamingVoice] KI-188 TTS started β pausing recognition");
    // KI-203 - flip the result-drop flag the instant TTS starts. Per the
    // KI-203 notes, abort() has a tail during which onresult can still fire
    // with bot-voice transcripts; the flag closes that window.
    if (dropResultsClearTimerRef.current !== null) {
      clearTimeout(dropResultsClearTimerRef.current);
      dropResultsClearTimerRef.current = null;
    }
    dropResultsRef.current = true;
    console.debug("[useStreamingVoice] KI-203 dropResultsRef=true (TTS start)");
    const rec = recognitionRef.current;
    if (rec) {
      try { rec.abort(); } catch { /* ignore */ }
    }
    // KI-195 - stop the user-RMS learning loop until TTS ends so bot audio
    // bleed-through is not learned as "user speech level".
    stopUserRmsLoop();
    // KI-191 - re-duck every playing element whose volume is no longer at
    // the voice-mode level.
    ttsAudioElementsRef.current.forEach((el) => {
      if (!el.paused && el.volume !== VOICE_MODE_TTS_VOLUME) {
        try { el.volume = VOICE_MODE_TTS_VOLUME; } catch { /* ignore */ }
      }
    });
    // KI-195 - with the volume floor set, begin adaptive calibration.
    startVolumeCalibration();
    // KI-192 - the recorder may have been torn down between utterances, in
    // which case startBargeInLoop bails on its recorderActiveRef check.
    // Ensure capture first, then arm the VAD loop.
    if (wantRunningRef.current && !isTextRequestPendingRef.current) {
      void ensureAudioCapture()
        .then(() => {
          // TTS may have ended during the async round-trip; re-check.
          if (isTtsPlayingRef.current) {
            startBargeInLoop();
          }
        })
        .catch((err: unknown) => {
          // Fire-and-forget must not surface as an unhandled rejection.
          // Without capture there is no stream for the VAD loop, so
          // barge-in simply stays disarmed for this reply.
          console.debug(
            "[useStreamingVoice] KI-192 ensureAudioCapture failed; barge-in VAD not started",
            err,
          );
        });
    } else {
      // Best-effort. NOTE(review): startBargeInLoop re-checks these same two
      // gates first, so on this path it is expected to return immediately.
      startBargeInLoop();
    }
  } else if (!anyPlaying && wasPlaying) {
    // TTS just ended - revive recognition right away instead of waiting for
    // the heartbeat / visibility listeners.
    // KI-285 - clear the playback-start stamp so a stale value cannot
    // satisfy the grace check on the next turn before it is re-stamped.
    ttsPlaybackStartedAtRef.current = 0;
    console.debug("[useStreamingVoice] KI-188 TTS ended β resuming recognition");
    // KI-203 - keep dropping recognition results for POST_TTS_DROP_MS after
    // TTS ends, so buffered events from the aborted recognition cannot leak
    // the tail of the bot's speech into the input.
    if (dropResultsClearTimerRef.current !== null) {
      clearTimeout(dropResultsClearTimerRef.current);
    }
    dropResultsClearTimerRef.current = setTimeout(() => {
      dropResultsRef.current = false;
      dropResultsClearTimerRef.current = null;
      console.debug("[useStreamingVoice] KI-203 dropResultsRef=false (post-TTS window over)");
    }, POST_TTS_DROP_MS);
    stopBargeInLoop();
    // KI-195 - freeze the calibrated volume and resume learning the user's
    // speech RMS for the next turn.
    stopVolumeCalibration();
    startUserRmsLoop();
    if (wantRunningRef.current && !isTextRequestPendingRef.current) {
      safeStart();
    }
  }
};
/**
 * Starts tracking one <audio> element as a TTS output: ducks its volume,
 * attaches the bot-level analyser, subscribes to its playback events and
 * immediately re-evaluates the TTS state. Already-tracked elements are
 * ignored.
 */
const watchAudio = (el: HTMLAudioElement) => {
  const tracked = ttsAudioElementsRef.current;
  if (tracked.has(el)) return;
  tracked.add(el);

  // KI-191 - duck bot TTS to VOICE_MODE_TTS_VOLUME while voice mode is on.
  // KI-195 - if this exact element already has a calibrated volume from an
  // earlier turn, reuse it instead of resetting the adaptive level.
  try {
    const remembered = calibratedVolumes.get(el);
    el.volume = remembered === undefined ? VOICE_MODE_TTS_VOLUME : remembered;
    duckedAudios.add(el);
  } catch {
    /* readonly volume on some platforms - ignore */
  }

  // KI-190 - attach bot-level analyser for the adaptive threshold.
  attachBotAnalyser(el);

  const playbackEvents = ["play", "playing", "pause", "ended"];
  playbackEvents.forEach((eventName) => {
    el.addEventListener(eventName, updateTtsState);
  });

  // Covers audio that was already playing when it started being watched.
  updateTtsState();
};
/**
 * Stops tracking one <audio> element: removes the playback listeners added
 * by watchAudio, forgets the element and re-evaluates the TTS state.
 * Elements that are not tracked are ignored.
 */
const unwatchAudio = (el: HTMLAudioElement) => {
  const tracked = ttsAudioElementsRef.current;
  if (!tracked.has(el)) return;

  const playbackEvents = ["play", "playing", "pause", "ended"];
  playbackEvents.forEach((eventName) => {
    el.removeEventListener(eventName, updateTtsState);
  });

  tracked.delete(el);
  // The removed element may have been the only one playing.
  updateTtsState();
};
// Initial scan: start watching every <audio> already in the document.
document.querySelectorAll("audio").forEach((existing) => {
  watchAudio(existing);
});

// Watch the whole document for <audio> elements being added or removed.
const observer = new MutationObserver((mutations) => {
  // Applies `visit` to the node itself when it is an <audio>, and to every
  // <audio> nested beneath it. Non-element nodes are skipped.
  const visitAudioUnder = (
    node: Node,
    visit: (el: HTMLAudioElement) => void,
  ) => {
    if (!(node instanceof HTMLElement)) return;
    if (node.tagName === "AUDIO") visit(node as HTMLAudioElement);
    node.querySelectorAll?.("audio").forEach((nested) => {
      visit(nested as HTMLAudioElement);
    });
  };

  for (const mutation of mutations) {
    // Per record: additions are processed before removals.
    mutation.addedNodes.forEach((added) => visitAudioUnder(added, watchAudio));
    mutation.removedNodes.forEach((removed) => visitAudioUnder(removed, unwatchAudio));
  }
});
observer.observe(document.body, { childList: true, subtree: true });

// KI-195 - kick off the user-RMS learning loop right away so a baseline
// exists by the time the first TTS plays. Per the KI-195 notes the loop
// self-exits when its conditions are not met, so an unconditional start
// is safe.
startUserRmsLoop();

// FIX 5 (HIGH) - start the wall-clock decay so userSpeechRms never stays
// pinned high, even while the rAF loop is gated off during TTS playback.
startUserRmsWallClockDecay();
// Effect cleanup: undo everything the TTS-watching effect set up.
return () => {
  // KI-195 - stop the learning / calibration loops BEFORE clearing the
  // ducked-audio state so a calibration tick cannot race a clear().
  stopUserRmsLoop();
  // FIX 5 (HIGH) - clean up the wall-clock decay interval.
  stopUserRmsWallClockDecay();
  stopVolumeCalibration();
  calibratedVolumes.clear();

  observer.disconnect();

  // KI-191 - restore full volume on every element this effect ducked, so a
  // later voice-OFF session does not inherit quiet audio.
  duckedAudios.forEach((ducked) => {
    try {
      ducked.volume = 1.0;
    } catch {
      /* ignore */
    }
  });
  duckedAudios.clear();

  // Detach the playback listeners from every tracked element, then forget
  // the elements and reset the playing flag.
  const tracked = ttsAudioElementsRef.current;
  const playbackEvents = ["play", "playing", "pause", "ended"];
  tracked.forEach((el) => {
    playbackEvents.forEach((eventName) => {
      el.removeEventListener(eventName, updateTtsState);
    });
  });
  tracked.clear();
  isTtsPlayingRef.current = false;

  // KI-203 - cancel the post-TTS drop-results timer and clear the flag so a
  // disabled-then-re-enabled voice mode does not inherit stale state.
  const pendingClearTimer = dropResultsClearTimerRef.current;
  if (pendingClearTimer !== null) {
    clearTimeout(pendingClearTimer);
    dropResultsClearTimerRef.current = null;
  }
  dropResultsRef.current = false;

  // KI-189 - release AnalyserNode + AudioContext on unmount / disable.
  teardownAnalyser();
};
| }, [enabled, isSupported, isTextRequestPendingRef, safeStart]); | |
// KI-174 (2026-05-15) - immediate revival on visibility / focus changes.
// Reported symptom: after clicking away from the text box, voice input
// sometimes stops until voice mode is restarted. Root cause per KI-174:
// Chrome's SpeechRecognition auto-stops when the tab loses visibility, and
// the KI-173 heartbeat is throttled while hidden, so revival lags by
// several seconds after returning. This effect therefore attempts a restart
// on:
//   - document `visibilitychange`
//   - window `focus`
// and only acts when voice mode is wanted, no text request is pending, TTS
// is not playing, and the document is visible.
useEffect(() => {
  const unavailable =
    !enabled
    || !isSupported
    || typeof window === "undefined"
    || typeof document === "undefined";
  if (unavailable) return;

  const tryRevive = (trigger: string) => {
    const blocked =
      !wantRunningRef.current
      || isTextRequestPendingRef.current
      || isTtsPlayingRef.current // KI-188 - block revival during TTS
      || document.visibilityState !== "visible";
    if (blocked) return;
    console.debug("[useStreamingVoice] revival trigger=" + trigger);
    safeStart();
  };

  const handleVisibilityChange = () => tryRevive("visibilitychange");
  const handleWindowFocus = () => tryRevive("window.focus");

  document.addEventListener("visibilitychange", handleVisibilityChange);
  window.addEventListener("focus", handleWindowFocus);
  return () => {
    document.removeEventListener("visibilitychange", handleVisibilityChange);
    window.removeEventListener("focus", handleWindowFocus);
  };
}, [enabled, isSupported, isTextRequestPendingRef, safeStart]);
// Unmount cleanup: the effect has no setup work, only a teardown that stops
// recognition, releases audio capture and clears pending-utterance state.
useEffect(
  () => () => {
    wantRunningRef.current = false;
    clearRestartTimer();

    // Abort the recognizer first, then detach its handlers.
    const recognition = recognitionRef.current;
    if (recognition) {
      try {
        recognition.abort();
      } catch {
        /* ignore */
      }
      recognition.onresult = null;
      recognition.onerror = null;
      recognition.onend = null;
      recognition.onstart = null;
    }
    recognitionRef.current = null;

    teardownAudio();

    // KI-202 - clear the pending utterance grace timer on unmount.
    const graceTimer = pendingSubmitTimerRef.current;
    if (graceTimer !== null) {
      clearTimeout(graceTimer);
      pendingSubmitTimerRef.current = null;
    }
    pendingUtteranceRef.current = "";
    pendingChunksRef.current = [];

    // #53 / #54 - release the warm stream + recorder + AudioContext so the
    // OS mic indicator goes off when the app is torn down.
    disarmWarmStream();
  },
  [clearRestartTimer, teardownAudio, disarmWarmStream],
);
// FIX 3 (HIGH) - one-shot read-and-clear of the barge-in flag.
/**
 * Reports whether a barge-in has been requested since the last call.
 *
 * @returns `true` exactly once after the flag was raised (the flag is
 *   cleared as part of the read); `false` on every call until it is raised
 *   again.
 */
const consumeBargeInSignal = useCallback((): boolean => {
  if (!bargeInRequestedRef.current) return false;
  bargeInRequestedRef.current = false;
  return true;
}, []);
| return { | |
| start, | |
| stop, | |
| isSupported, | |
| consumeBargeInSignal, | |
| // #53 / #54 β warm-stream + pre-roll push-to-talk API. | |
| isWarm, | |
| armWarmStream, | |
| disarmWarmStream, | |
| beginPushToTalk, | |
| endPushToTalk, | |
| consumePreRollChunks, | |
| }; | |
| } | |