Spaces:
Sleeping
Sleeping
fix(#40+voice+upload): SSOT scorecard parity + Sarvam 30s-truncation + uploaded-PDF retrieval gate
"""Regression test for KI-302 (2026-05-18) — full voice transcript.

BUG (user-reported, live): "Full transcript still not coming while
speaking." A long spoken utterance was truncated to a partial transcript.

ROOT CAUSE: Sarvam's saarika REST /speech-to-text endpoint has a hard ~30s
audio limit. It does NOT 4xx on longer audio — it returns HTTP 200 with a
`transcript` containing ONLY the first ~30s and silently drops the rest.
The live-voice hook's grace-window batching deliberately merges multiple
pause-separated speech bursts into ONE blob, so any real-world utterance
with natural pauses easily exceeds 30s and was silently cut.

FIX: `SarvamSTT.transcribe` now decodes the audio, and if it exceeds the
safe REST ceiling it splits it into <= STT_CHUNK_MS chunks (at silence
boundaries where possible), transcribes each chunk, and concatenates the
transcripts in order so the COMPLETE utterance survives.

This test pins that contract WITHOUT needing ffmpeg/pydub by stubbing a
minimal AudioSegment that supports the slicing / len / dBFS / export
surface the splitter uses, and a fake Sarvam HTTP layer that returns a
DIFFERENT transcript per 25s window — so a regression to single-shot
behaviour (only the first window transcribed) fails loudly.

Run:
    cd /Users/rohitsar/Developer/Insurance\\ Sales\\ Bot
    PYTHONPATH=$PWD .venv/bin/python -m pytest \
        tests/test_stt_long_audio_chunking.py -v
"""
from __future__ import annotations

import asyncio
import os
import sys
import types

import pytest

# Set a dummy key BEFORE importing the provider (hence the E402 suppression
# below); presumably the provider needs SARVAM_API_KEY to be present at
# import or construction time - confirm against backend.providers.sarvam_stt.
os.environ.setdefault("SARVAM_API_KEY", "test-key-for-stt-chunking")

from backend.providers.sarvam_stt import (  # noqa: E402
    STT_CHUNK_MS,
    SarvamSTT,
)
# ---------------------------------------------------------------------------
# Minimal pydub.AudioSegment stub.
# The splitter only uses: len(seg) -> ms, seg[a:b] -> sub-segment,
# seg.dBFS -> float, seg.export(buf, format=...), set_frame_rate/channels/
# sample_width (chained no-ops), AudioSegment.from_file(...).
# An exported chunk carries its ms range as text ("FAKEWAV:<start>:<end>")
# so it can be decoded back into the exact ms range it represents — that
# lets the fake Sarvam server return the words spoken in that window.
# ---------------------------------------------------------------------------
class FakeAudioSegment:
    """Stand-in for ``pydub.AudioSegment`` that models a half-open ms range.

    No audio is stored: a segment is just ``[start_ms, end_ms)``. Slicing
    narrows that range, ``len()`` reports its duration in milliseconds and
    ``export`` writes the range as text so a fake server can recover it.
    """

    def __init__(self, start_ms: int, end_ms: int):
        self.start_ms = start_ms
        self.end_ms = end_ms  # exclusive

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.start_ms}, {self.end_ms})"

    # --- duration -----------------------------------------------------------
    def __len__(self) -> int:
        return self.end_ms - self.start_ms

    # --- slicing (pydub uses ms-based slicing) ------------------------------
    def __getitem__(self, sl) -> "FakeAudioSegment":
        """Return the sub-segment for a ms-based slice relative to this one.

        Bounds are clamped to ``[0, len(self)]``. Only slices are supported;
        anything else raises ``TypeError``.
        """
        if not isinstance(sl, slice):
            raise TypeError("only slice indexing used by splitter")
        lo = 0 if sl.start is None else sl.start
        hi = len(self) if sl.stop is None else sl.stop
        lo = max(0, min(lo, len(self)))
        hi = max(0, min(hi, len(self)))
        # An inverted slice (stop < start) is empty, as with any Python
        # sequence; without this the result would have a negative length
        # and len() on it would raise ValueError.
        hi = max(hi, lo)
        return FakeAudioSegment(self.start_ms + lo, self.start_ms + hi)

    # --- loudness -----------------------------------------------------------
    @property
    def dBFS(self) -> float:  # noqa: N802 - mirrors the pydub attribute name
        """Loudness of the segment, read as an attribute like pydub's.

        Constant moderate loudness everywhere EXCEPT a deliberate silent
        gap at [24000, 24400) ms so _split_on_silence has a real pause to
        snap to near the 25s ceiling. Only the segment's start is tested
        against the gap. Empty segments are reported as silent.
        """
        if self.end_ms - self.start_ms == 0:
            return float("-inf")
        if 24000 <= self.start_ms < 24400:
            return float("-inf")
        return -20.0

    # --- transcode no-ops ---------------------------------------------------
    def set_frame_rate(self, _) -> "FakeAudioSegment":
        return self

    def set_channels(self, _) -> "FakeAudioSegment":
        return self

    def set_sample_width(self, _) -> "FakeAudioSegment":
        return self

    def export(self, buf, format="wav"):  # noqa: A002 - pydub signature
        """Write ``FAKEWAV:<start_ms>:<end_ms>`` to *buf*, rewind and return it.

        Encoding the ms range lets the fake Sarvam server recover which
        words this chunk covers. *format* is accepted and ignored.
        """
        buf.write(f"FAKEWAV:{self.start_ms}:{self.end_ms}".encode())
        buf.seek(0)
        return buf
class FakeAudioSegmentFactory:
    """Plays the role of the ``pydub.AudioSegment`` class for decoding."""

    @staticmethod
    def from_file(_bio, format=None):  # noqa: A002 - pydub signature
        """Ignore the input bytes and return a fixed 92-second segment.

        Declared static because it takes no ``self``; this also matches the
        ``staticmethod`` replacement the short-utterance test installs.
        """
        # A 92-second utterance (> 3 Sarvam windows of 25s).
        return FakeAudioSegment(0, 92_000)
@pytest.fixture
def patch_pydub(monkeypatch):
    """Register a fake ``pydub`` module whose ``AudioSegment`` is the stub.

    The entry is placed in ``sys.modules`` through ``monkeypatch``, so it is
    removed again when the test finishes. Must be a pytest fixture: the
    tests request it by parameter name.

    NOTE(review): this only takes effect for ``import pydub`` statements
    executed while the fixture is active - presumably the provider imports
    pydub lazily; confirm.
    """
    fake_pydub = types.ModuleType("pydub")
    fake_pydub.AudioSegment = FakeAudioSegmentFactory
    monkeypatch.setitem(sys.modules, "pydub", fake_pydub)
    yield
# ---------------------------------------------------------------------------
# Fake Sarvam HTTP layer. Returns the words spoken in the chunk's ms window.
# The "spoken script" is one word per second: word_0 .. word_91. Sarvam's
# real 30s truncation is simulated by capping any single call at 30000 ms of
# audio (it ignores everything past 30s of the blob it receives) — exactly
# the silent-truncation behaviour the fix must defeat by never sending a
# chunk longer than ~25s.
# ---------------------------------------------------------------------------
# Maximum audio (ms) the fake server transcribes from a single request;
# anything beyond this in one blob is silently dropped.
SARVAM_HARD_LIMIT_MS = 30_000
| class _FakeResp: | |
| def __init__(self, payload): | |
| self._payload = payload | |
| def raise_for_status(self): | |
| return None | |
| def json(self): | |
| return self._payload | |
| class _FakeClient: | |
| def __init__(self, *a, **k): | |
| pass | |
| async def __aenter__(self): | |
| return self | |
| async def __aexit__(self, *a): | |
| return False | |
| async def post(self, url, headers=None, files=None, data=None): | |
| raw = files["file"][1].read() | |
| text = raw.decode() | |
| assert text.startswith("FAKEWAV:"), text | |
| _, s, e = text.split(":") | |
| start_ms, end_ms = int(s), int(e) | |
| # Simulate Sarvam's silent 30s truncation on the received blob. | |
| capped_end = min(end_ms, start_ms + SARVAM_HARD_LIMIT_MS) | |
| words = [ | |
| f"word_{ms // 1000}" | |
| for ms in range(start_ms, capped_end, 1000) | |
| ] | |
| return _FakeResp({ | |
| "transcript": " ".join(words), | |
| "language_code": "en-IN", | |
| "language_probability": 0.99, | |
| }) | |
@pytest.fixture
def patch_httpx(monkeypatch):
    """Swap ``httpx.AsyncClient`` for ``_FakeClient`` for one test.

    The attribute is replaced on the ``httpx`` module object that the
    provider module references, and ``monkeypatch`` restores it at
    teardown. Must be a pytest fixture: the tests request it by parameter
    name.
    """
    import backend.providers.sarvam_stt as mod

    monkeypatch.setattr(mod.httpx, "AsyncClient", _FakeClient)
    yield
def test_long_utterance_is_fully_transcribed(patch_pydub, patch_httpx):
    """A 92s utterance must yield ALL 92 words, not just the first ~30."""
    stt = SarvamSTT()
    # > 1 KB so the short-audio guard doesn't short-circuit.
    audio = b"\x00" * 4096
    result = asyncio.run(
        stt.transcribe(audio_bytes=audio, audio_format="webm", language_code="en-IN")
    )
    words = result.text.split()

    # CORE CONTRACT: every one of the 92 spoken words must survive — the
    # pre-fix single-shot path returned only word_0..word_29 (Sarvam's 30s
    # silent truncation). A word landing exactly on a silence-snap seam can
    # legitimately appear in BOTH adjoining chunks (a duplicated word at the
    # seam is harmless; a LOST word is the bug). So we assert: (1) no word
    # is missing, (2) order is non-decreasing, (3) the count is ~92 (never
    # ~30). De-duplicating consecutive repeats must reproduce the exact
    # script.
    expected = [f"word_{i}" for i in range(92)]
    deduped = [w for i, w in enumerate(words) if i == 0 or w != words[i - 1]]
    assert deduped == expected, (
        f"transcript truncated/garbled: deduped {len(deduped)} words "
        f"(first={deduped[:3]}, last={deduped[-3:]}), expected 92 in order"
    )

    # Hard proof this is NOT the pre-fix 30s truncation.
    assert len(words) >= 90
    assert "word_91" in words and "word_60" in words
    assert result.raw.get("chunked") is True
    # 92s split at <=25s ceiling => at least 4 chunks.
    assert result.raw.get("chunk_count", 0) >= 4
    assert result.language_code == "en-IN"
def test_short_utterance_single_call_unchanged(patch_pydub, patch_httpx, monkeypatch):
    """A sub-ceiling clip must take exactly ONE Sarvam call (no behaviour
    change / no extra latency for the common case)."""
    # Patch the factory to return a short 8s clip.
    short = FakeAudioSegment(0, 8_000)
    monkeypatch.setattr(
        FakeAudioSegmentFactory, "from_file", staticmethod(lambda *a, **k: short)
    )

    stt = SarvamSTT()
    result = asyncio.run(
        stt.transcribe(audio_bytes=b"\x00" * 4096, audio_format="webm")
    )

    # The fake server emits one word per second, so 8s => word_0..word_7.
    words = result.text.split()
    assert words == [f"word_{i}" for i in range(8)]
    # Single-call path does NOT set the chunked marker.
    assert result.raw.get("chunked") is not True
def test_no_chunk_exceeds_sarvam_safe_ceiling(patch_pydub):
    """Every produced chunk must be <= STT_CHUNK_MS so Sarvam never silently
    truncates a chunk. This is the core invariant the fix rests on."""
    seg = FakeAudioSegment(0, 92_000)
    chunks = SarvamSTT._split_on_silence(seg, STT_CHUNK_MS)

    assert len(chunks) >= 4
    for c in chunks:
        assert len(c) <= STT_CHUNK_MS, f"chunk {len(c)} ms exceeds {STT_CHUNK_MS}"

    # Chunks must tile the whole utterance with no gap and no overlap.
    assert chunks[0].start_ms == 0
    assert chunks[-1].end_ms == 92_000
    for a, b in zip(chunks, chunks[1:]):
        assert a.end_ms == b.start_ms, "chunk boundary gap/overlap loses audio"