InsuranceBot / tests /test_stt_long_audio_chunking.py
rohitsar567's picture
fix(#40+voice+upload): SSOT scorecard parity + Sarvam 30s-truncation + uploaded-PDF retrieval gate
a5d9bfd
Raw
History Blame Contribute Delete
9.37 kB
"""Regression test for KI-302 (2026-05-18) — full voice transcript.
BUG (user-reported, live): "Full transcript still not coming while
speaking." A long spoken utterance was truncated to a partial transcript.
ROOT CAUSE: Sarvam's saarika REST /speech-to-text endpoint has a hard ~30s
audio limit. It does NOT 4xx on longer audio — it returns HTTP 200 with a
`transcript` containing ONLY the first ~30s and silently drops the rest.
The live-voice hook's grace-window batching deliberately merges multiple
pause-separated speech bursts into ONE blob, so any real-world utterance
with natural pauses easily exceeds 30s and was silently cut.
FIX: `SarvamSTT.transcribe` now decodes the audio, and if it exceeds the
safe REST ceiling it splits it into <= STT_CHUNK_MS chunks (at silence
boundaries where possible), transcribes each chunk, and concatenates the
transcripts in order so the COMPLETE utterance survives.
This test pins that contract WITHOUT needing ffmpeg/pydub by stubbing a
minimal AudioSegment that supports the slicing / len / dBFS / export
surface the splitter uses, and a fake Sarvam HTTP layer that returns a
DIFFERENT transcript per 25s window — so a regression to single-shot
behaviour (only the first window transcribed) fails loudly.
Run:
cd /Users/rohitsar/Developer/Insurance\\ Sales\\ Bot
PYTHONPATH=$PWD .venv/bin/python -m pytest \
tests/test_stt_long_audio_chunking.py -v
"""
from __future__ import annotations
import asyncio
import os
import sys
import types
import pytest
# Seed a dummy API key BEFORE the provider import below (which is why that
# import carries an E402 waiver). setdefault leaves any SARVAM_API_KEY that is
# already present in the environment untouched.
# NOTE(review): presumably the provider reads the key at import or
# construction time — confirm against backend.providers.sarvam_stt.
os.environ.setdefault("SARVAM_API_KEY", "test-key-for-stt-chunking")
from backend.providers.sarvam_stt import ( # noqa: E402
STT_CHUNK_MS,
SarvamSTT,
)
# ---------------------------------------------------------------------------
# Minimal pydub.AudioSegment stub.
# The splitter only uses: len(seg) -> ms, seg[a:b] -> sub-segment,
# seg.dBFS -> float, seg.export(buf, format=...) , set_frame_rate/channels/
# sample_width (chained no-ops), AudioSegment.from_file(...).
# Each "ms" carries a sentinel byte so a concatenated/exported chunk can be
# decoded back into the exact ms range it represents — that lets the fake
# Sarvam server return the words spoken in that window.
# ---------------------------------------------------------------------------
class FakeAudioSegment:
    """Stand-in for ``pydub.AudioSegment`` that models audio as an ms range.

    A segment is the half-open interval ``[start_ms, end_ms)`` on the timeline
    of the original utterance. No audio data is stored: slicing produces a
    narrower interval and ``export`` writes the interval as text, so a fake
    server can tell exactly which part of the utterance a chunk covers.
    """

    def __init__(self, start_ms: int, end_ms: int):
        self.start_ms = start_ms
        self.end_ms = end_ms  # exclusive

    def __repr__(self):
        # Readable output when a segment shows up in an assertion failure.
        return f"{type(self).__name__}({self.start_ms}, {self.end_ms})"

    # --- duration -----------------------------------------------------------
    def __len__(self):
        """Return the duration in milliseconds."""
        return self.end_ms - self.start_ms

    # --- slicing (pydub uses ms-based slicing) ------------------------------
    def __getitem__(self, sl):
        """Return the sub-segment for a millisecond slice relative to this one.

        Bounds are clamped to ``[0, len(self)]``. A slice whose stop precedes
        its start yields an empty segment (as Python sequences do) rather than
        a negative-length one, on which ``len()`` would raise ``ValueError``.

        Raises:
            TypeError: for anything other than a slice.
        """
        if not isinstance(sl, slice):
            raise TypeError("only slice indexing used by splitter")
        duration = len(self)
        lo = 0 if sl.start is None else sl.start
        hi = duration if sl.stop is None else sl.stop
        lo = max(0, min(lo, duration))
        hi = max(0, min(hi, duration))
        # Inverted bounds collapse to an empty segment anchored at ``lo``.
        hi = max(hi, lo)
        return FakeAudioSegment(self.start_ms + lo, self.start_ms + hi)

    # --- loudness -----------------------------------------------------------
    @property
    def dBFS(self):
        """Return a synthetic loudness for this segment.

        Constant moderate loudness everywhere EXCEPT a deliberate silent gap
        at [24000, 24400) ms so _split_on_silence has a real pause to snap to
        near the 25s ceiling. Only the segment's START is tested against the
        gap; zero-length segments are reported as silent.
        """
        if self.end_ms - self.start_ms == 0:
            return float("-inf")
        if 24000 <= self.start_ms < 24400:
            return float("-inf")
        return -20.0

    # --- transcode no-ops ---------------------------------------------------
    def set_frame_rate(self, _):
        """No-op that returns ``self`` so calls can be chained."""
        return self

    def set_channels(self, _):
        """No-op that returns ``self`` so calls can be chained."""
        return self

    def set_sample_width(self, _):
        """No-op that returns ``self`` so calls can be chained."""
        return self

    def export(self, buf, format="wav"):  # noqa: A002 - pydub signature
        """Write ``FAKEWAV:<start_ms>:<end_ms>`` to *buf*, rewind and return it.

        Encoding the ms range lets the fake Sarvam server recover which words
        this chunk covers. The *format* argument is accepted but ignored.
        """
        buf.write(f"FAKEWAV:{self.start_ms}:{self.end_ms}".encode())
        buf.seek(0)
        return buf
class FakeAudioSegmentFactory:
    """Plays the role of ``pydub.AudioSegment`` as a class: only ``from_file``."""

    @staticmethod
    def from_file(_bio, format=None):  # noqa: A002 - pydub signature
        """Ignore the input and return a fixed 92-second fake segment.

        92 s is longer than three 25 s Sarvam windows.
        """
        utterance_ms = 92_000
        return FakeAudioSegment(0, utterance_ms)
@pytest.fixture
def patch_pydub(monkeypatch):
    """Install a stub ``pydub`` module in ``sys.modules`` for one test.

    The stub exposes a single attribute, ``AudioSegment``, bound to
    ``FakeAudioSegmentFactory``. ``monkeypatch`` removes the entry again at
    teardown.
    """
    stub_module = types.ModuleType("pydub")
    stub_module.AudioSegment = FakeAudioSegmentFactory
    monkeypatch.setitem(sys.modules, "pydub", stub_module)
    yield
# ---------------------------------------------------------------------------
# Fake Sarvam HTTP layer. Returns the words spoken in the chunk's ms window.
# The "spoken script" is one word per second: word_0 .. word_91. Sarvam's
# real 30s truncation is simulated by capping any single call at 30000 ms of
# audio (it ignores everything past 30s of the blob it receives) — exactly
# the silent-truncation behaviour the fix must defeat by never sending a
# chunk longer than ~25s.
# ---------------------------------------------------------------------------
# Amount of audio, in milliseconds, that the fake server honours per request:
# _FakeClient.post produces no words for anything past start_ms + this value.
SARVAM_HARD_LIMIT_MS = 30_000
class _FakeResp:
    """Canned HTTP response exposing only ``raise_for_status`` and ``json``."""

    def __init__(self, payload):
        # Kept as-is and handed back unchanged by json().
        self._body = payload

    def raise_for_status(self):
        """Never raise: the fake response always counts as a success."""
        return None

    def json(self):
        """Return the payload supplied at construction."""
        return self._body
class _FakeClient:
    """Async-context-manager stand-in for the HTTP client used by the provider."""

    def __init__(self, *args, **kwargs):
        """Accept and discard any constructor arguments."""

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        # Returning False means exceptions raised inside the block propagate.
        return False

    async def post(self, url, headers=None, files=None, data=None):
        """Answer a speech-to-text POST with one word per second of audio.

        The upload is read from ``files["file"][1]`` and must be the
        ``FAKEWAV:<start_ms>:<end_ms>`` text written by
        ``FakeAudioSegment.export``. Words are emitted at 1000 ms steps from
        ``start_ms``, stopping at ``end_ms`` or at ``SARVAM_HARD_LIMIT_MS``
        past the start, whichever comes first.
        """
        upload = files["file"][1]
        text = upload.read().decode()
        assert text.startswith("FAKEWAV:"), text
        _tag, lo, hi = text.split(":")
        start_ms = int(lo)
        end_ms = int(hi)
        # Simulate Sarvam's silent 30s truncation on the received blob.
        audible_end = start_ms + SARVAM_HARD_LIMIT_MS
        if end_ms < audible_end:
            audible_end = end_ms
        transcript = " ".join(
            f"word_{ms // 1000}" for ms in range(start_ms, audible_end, 1000)
        )
        return _FakeResp(
            {
                "transcript": transcript,
                "language_code": "en-IN",
                "language_probability": 0.99,
            }
        )
@pytest.fixture
def patch_httpx(monkeypatch):
    """Swap ``AsyncClient`` on the provider's ``httpx`` for ``_FakeClient``.

    The replacement is made on the ``httpx`` object reachable from the
    provider module and is reverted by ``monkeypatch`` at teardown.
    """
    import backend.providers.sarvam_stt as stt_module

    http_lib = stt_module.httpx
    monkeypatch.setattr(http_lib, "AsyncClient", _FakeClient)
    yield
def test_long_utterance_is_fully_transcribed(patch_pydub, patch_httpx):
    """A 92s utterance must yield ALL 92 words, not just the first ~30."""
    provider = SarvamSTT()
    # > 1 KB so the short-audio guard doesn't short-circuit.
    payload = b"\x00" * 4096
    transcription = asyncio.run(
        provider.transcribe(
            audio_bytes=payload, audio_format="webm", language_code="en-IN"
        )
    )
    words = transcription.text.split()

    # CORE CONTRACT: every one of the 92 spoken words must survive — the
    # pre-fix single-shot path returned only word_0..word_29 (Sarvam's 30s
    # silent truncation). A word that straddles a silence-snap seam may
    # legitimately show up in BOTH neighbouring chunks: a duplicate at the
    # seam is harmless, a LOST word is the bug. Collapsing consecutive
    # repeats must therefore reproduce the script exactly, which covers
    # "nothing missing" and "order preserved" in one comparison.
    script = [f"word_{second}" for second in range(92)]
    collapsed = []
    for token in words:
        if not collapsed or collapsed[-1] != token:
            collapsed.append(token)
    assert collapsed == script, (
        f"transcript truncated/garbled: deduped {len(collapsed)} words "
        f"(first={collapsed[:3]}, last={collapsed[-3:]}), expected 92 in order"
    )

    # Hard proof this is NOT the pre-fix 30s truncation.
    assert len(words) >= 90
    assert "word_91" in words
    assert "word_60" in words

    metadata = transcription.raw
    assert metadata.get("chunked") is True
    # 92s split at <=25s ceiling => at least 4 chunks.
    assert metadata.get("chunk_count", 0) >= 4
    assert transcription.language_code == "en-IN"
def test_short_utterance_single_call_unchanged(patch_pydub, patch_httpx, monkeypatch):
    """A sub-ceiling clip must take exactly ONE Sarvam call (no behaviour
    change / no extra latency for the common case).

    The call count is measured directly by wrapping ``_FakeClient.post``;
    checking the ``chunked`` marker alone would not catch a regression that
    issues extra requests without setting it.
    """
    # Patch the factory to return a short 8s clip.
    short = FakeAudioSegment(0, 8_000)
    monkeypatch.setattr(
        FakeAudioSegmentFactory, "from_file", staticmethod(lambda *a, **k: short)
    )

    # Count every POST that reaches the fake Sarvam server, then delegate to
    # the real fake so the response is unchanged.
    post_calls = []
    real_post = _FakeClient.post

    async def counting_post(self, *args, **kwargs):
        post_calls.append((args, kwargs))
        return await real_post(self, *args, **kwargs)

    monkeypatch.setattr(_FakeClient, "post", counting_post)

    stt = SarvamSTT()
    result = asyncio.run(
        stt.transcribe(audio_bytes=b"\x00" * 4096, audio_format="webm")
    )
    words = result.text.split()
    assert words == [f"word_{i}" for i in range(8)]
    assert len(post_calls) == 1, (
        f"expected exactly one Sarvam call for a short clip, got {len(post_calls)}"
    )
    # Single-call path does NOT set the chunked marker.
    assert result.raw.get("chunked") is not True
def test_no_chunk_exceeds_sarvam_safe_ceiling(patch_pydub):
    """Every produced chunk must be <= STT_CHUNK_MS so Sarvam never silently
    truncates a chunk. This is the core invariant the fix rests on."""
    total_ms = 92_000
    utterance = FakeAudioSegment(0, total_ms)
    pieces = SarvamSTT._split_on_silence(utterance, STT_CHUNK_MS)
    assert len(pieces) >= 4

    for c in pieces:
        assert len(c) <= STT_CHUNK_MS, f"chunk {len(c)} ms exceeds {STT_CHUNK_MS}"

    # The pieces must cover [0, total_ms) end to end: first starts at 0, last
    # ends at the utterance end, and each piece begins where the previous
    # one stopped.
    assert pieces[0].start_ms == 0
    assert pieces[-1].end_ms == total_ms
    cursor = pieces[0].end_ms
    for following in pieces[1:]:
        assert cursor == following.start_ms, "chunk boundary gap/overlap loses audio"
        cursor = following.end_ms