Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tests /test_stt_long_audio_chunking.py

rohitsar567

fix(#40+voice+upload): SSOT scorecard parity + Sarvam 30s-truncation + uploaded-PDF retrieval gate

a5d9bfd about 2 months ago

Raw

History Blame Contribute Delete

9.37 kB

	"""Regression test for KI-302 (2026-05-18) — full voice transcript.

	BUG (user-reported, live): "Full transcript still not coming while
	speaking." A long spoken utterance was truncated to a partial transcript.

	ROOT CAUSE: Sarvam's saarika REST /speech-to-text endpoint has a hard ~30s
	audio limit. It does NOT 4xx on longer audio — it returns HTTP 200 with a
	`transcript` containing ONLY the first ~30s and silently drops the rest.
	The live-voice hook's grace-window batching deliberately merges multiple
	pause-separated speech bursts into ONE blob, so any real-world utterance
	with natural pauses easily exceeds 30s and was silently cut.

	FIX: `SarvamSTT.transcribe` now decodes the audio, and if it exceeds the
	safe REST ceiling it splits it into <= STT_CHUNK_MS chunks (at silence
	boundaries where possible), transcribes each chunk, and concatenates the
	transcripts in order so the COMPLETE utterance survives.

	This test pins that contract WITHOUT needing ffmpeg/pydub by stubbing a
	minimal AudioSegment that supports the slicing / len / dBFS / export
	surface the splitter uses, and a fake Sarvam HTTP layer that returns a
	DIFFERENT transcript per 25s window — so a regression to single-shot
	behaviour (only the first window transcribed) fails loudly.

	Run:
	cd /Users/rohitsar/Developer/Insurance\\ Sales\\ Bot
	PYTHONPATH=$PWD .venv/bin/python -m pytest \
	tests/test_stt_long_audio_chunking.py -v
	"""

	from __future__ import annotations

	import asyncio
	import os
	import sys
	import types

	import pytest

	# Provide a dummy API key BEFORE the provider module is imported (hence the
	# E402 suppression on the import that follows). setdefault leaves a real
	# key untouched if one is already exported.
	# NOTE(review): presumably the provider reads SARVAM_API_KEY from the
	# environment at import or construction time — confirm.
	os.environ.setdefault("SARVAM_API_KEY", "test-key-for-stt-chunking")

	from backend.providers.sarvam_stt import ( # noqa: E402
	STT_CHUNK_MS,
	SarvamSTT,
	)


	# ---------------------------------------------------------------------------
	# Minimal pydub.AudioSegment stub.
	# The splitter only uses: len(seg) -> ms, seg[a:b] -> sub-segment,
	# seg.dBFS -> float, seg.export(buf, format=...) , set_frame_rate/channels/
	# sample_width (chained no-ops), AudioSegment.from_file(...).
	# Each "ms" carries a sentinel byte so a concatenated/exported chunk can be
	# decoded back into the exact ms range it represents — that lets the fake
	# Sarvam server return the words spoken in that window.
	# ---------------------------------------------------------------------------
	class FakeAudioSegment:
	    """Stand-in for ``pydub.AudioSegment`` that models audio as a ms range.

	    A segment is the half-open interval ``[start_ms, end_ms)`` on the
	    timeline of the original utterance. No audio data is stored; ``export``
	    writes the interval itself so the fake HTTP layer can tell which part of
	    the utterance a chunk covers.
	    """

	    def __init__(self, start_ms: int, end_ms: int):
	        self.start_ms = start_ms
	        self.end_ms = end_ms  # exclusive

	    def __repr__(self):
	        # Makes pytest assertion failures on chunk lists readable.
	        return f"FakeAudioSegment({self.start_ms}, {self.end_ms})"

	    # --- duration -----------------------------------------------------------
	    def __len__(self):
	        """Return the duration in milliseconds."""
	        return self.end_ms - self.start_ms

	    # --- slicing (pydub uses ms-based slicing) ------------------------------
	    def __getitem__(self, sl):
	        """Return the sub-segment for a ms-based slice relative to this segment.

	        Bounds are clamped to ``[0, len(self)]``. An inverted slice
	        (start past stop) yields an empty segment instead of one with a
	        negative duration, on which ``len()`` would raise ``ValueError``.

	        Raises:
	            TypeError: if ``sl`` is not a slice.
	        """
	        if not isinstance(sl, slice):
	            raise TypeError("only slice indexing used by splitter")
	        duration = len(self)
	        lo = 0 if sl.start is None else sl.start
	        hi = duration if sl.stop is None else sl.stop
	        lo = max(0, min(lo, duration))
	        # Never let the upper bound fall below the lower bound.
	        hi = max(lo, min(hi, duration))
	        return FakeAudioSegment(self.start_ms + lo, self.start_ms + hi)

	    # --- loudness -----------------------------------------------------------
	    @property
	    def dBFS(self):
	        """Return a synthetic loudness for this segment.

	        Constant moderate loudness everywhere EXCEPT a deliberate silent
	        gap at [24000, 24400) ms so _split_on_silence has a real pause to
	        snap to near the 25s ceiling. Empty segments are silent too.
	        """
	        if self.end_ms - self.start_ms == 0:
	            return float("-inf")
	        # NOTE(review): only the START of the segment is tested, so any
	        # window that begins inside the gap reads as silent regardless of
	        # how far it extends past 24400 ms.
	        if 24000 <= self.start_ms < 24400:
	            return float("-inf")
	        return -20.0

	    # --- transcode no-ops ---------------------------------------------------
	    def set_frame_rate(self, _):
	        return self

	    def set_channels(self, _):
	        return self

	    def set_sample_width(self, _):
	        return self

	    def export(self, buf, format="wav"):  # noqa: A002 - pydub signature
	        """Write a ``FAKEWAV:<start_ms>:<end_ms>`` marker to ``buf``.

	        Encodes the ms range so the fake Sarvam server can recover which
	        words this chunk covers. ``format`` is accepted for signature
	        compatibility and otherwise ignored. Returns ``buf`` rewound to
	        offset 0.
	        """
	        buf.write(f"FAKEWAV:{self.start_ms}:{self.end_ms}".encode())
	        buf.seek(0)
	        return buf


	class FakeAudioSegmentFactory:
	    """Object installed as ``pydub.AudioSegment`` by the ``patch_pydub`` fixture."""

	    @staticmethod
	    def from_file(_bio, format=None):  # noqa: A002 - pydub signature
	        """Ignore both arguments and return a fixed fake 92-second utterance.

	        92 s is longer than three 25 s Sarvam windows, so a chunking
	        implementation has to produce at least four chunks.
	        """
	        utterance_ms = 92_000
	        return FakeAudioSegment(0, utterance_ms)


	@pytest.fixture
	def patch_pydub(monkeypatch):
	    """Install a stub ``pydub`` module in ``sys.modules`` for one test.

	    The stub exposes only ``AudioSegment``, bound to
	    ``FakeAudioSegmentFactory``. monkeypatch restores the previous
	    ``sys.modules`` entry when the test finishes.
	    """
	    stub_module = types.ModuleType("pydub")
	    stub_module.AudioSegment = FakeAudioSegmentFactory
	    monkeypatch.setitem(sys.modules, "pydub", stub_module)
	    yield


	# ---------------------------------------------------------------------------
	# Fake Sarvam HTTP layer. Returns the words spoken in the chunk's ms window.
	# The "spoken script" is one word per second: word_0 .. word_91. Sarvam's
	# real 30s truncation is simulated by capping any single call at 30000 ms of
	# audio (it ignores everything past 30s of the blob it receives) — exactly
	# the silent-truncation behaviour the fix must defeat by never sending a
	# chunk longer than ~25s.
	# ---------------------------------------------------------------------------
	# Maximum audio (in ms) the fake server transcribes per request; the fake
	# client's post() drops every word past this offset of the blob it receives.
	SARVAM_HARD_LIMIT_MS = 30_000


	class _FakeResp:
	def __init__(self, payload):
	self._payload = payload

	def raise_for_status(self):
	return None

	def json(self):
	return self._payload


	class _FakeClient:
	def __init__(self, a, *k):
	pass

	async def __aenter__(self):
	return self

	async def __aexit__(self, *a):
	return False

	async def post(self, url, headers=None, files=None, data=None):
	raw = files["file"][1].read()
	text = raw.decode()
	assert text.startswith("FAKEWAV:"), text
	_, s, e = text.split(":")
	start_ms, end_ms = int(s), int(e)
	# Simulate Sarvam's silent 30s truncation on the received blob.
	capped_end = min(end_ms, start_ms + SARVAM_HARD_LIMIT_MS)
	words = [
	f"word_{ms // 1000}"
	for ms in range(start_ms, capped_end, 1000)
	]
	return _FakeResp({
	"transcript": " ".join(words),
	"language_code": "en-IN",
	"language_probability": 0.99,
	})


	@pytest.fixture
	def patch_httpx(monkeypatch):
	    """Replace ``AsyncClient`` on the provider's ``httpx`` with ``_FakeClient``.

	    The attribute is set on the ``httpx`` object referenced by the
	    provider module; monkeypatch restores the original after the test.
	    """
	    from backend.providers import sarvam_stt as provider

	    monkeypatch.setattr(provider.httpx, "AsyncClient", _FakeClient)
	    yield


	def test_long_utterance_is_fully_transcribed(patch_pydub, patch_httpx):
	    """A 92s utterance must yield ALL 92 words, not just the first ~30."""
	    # The bytes are placeholders: the patched decoder ignores its input and
	    # always returns the 92s fake segment. They only need to be > 1 KB so
	    # the short-audio guard doesn't short-circuit.
	    payload = b"\x00" * 4096
	    transcriber = SarvamSTT()
	    result = asyncio.run(
	        transcriber.transcribe(
	            audio_bytes=payload,
	            audio_format="webm",
	            language_code="en-IN",
	        )
	    )

	    words = result.text.split()
	    # CORE CONTRACT: all 92 spoken words survive. The pre-fix single-shot
	    # path returned only word_0..word_29 (Sarvam's 30s silent truncation).
	    # A word sitting exactly on a silence-snap seam may legitimately show
	    # up in BOTH neighbouring chunks: a duplicate at the seam is harmless,
	    # a LOST word is the bug. Hence: collapse consecutive repeats and
	    # require the result to be the exact script, in order.
	    expected = [f"word_{i}" for i in range(92)]
	    deduped = words[:1] + [
	        current
	        for previous, current in zip(words, words[1:])
	        if current != previous
	    ]
	    assert deduped == expected, (
	        f"transcript truncated/garbled: deduped {len(deduped)} words "
	        f"(first={deduped[:3]}, last={deduped[-3:]}), expected 92 in order"
	    )
	    # Hard proof this is NOT the pre-fix 30s truncation.
	    assert len(words) >= 90
	    assert "word_91" in words
	    assert "word_60" in words
	    raw = result.raw
	    assert raw.get("chunked") is True
	    # 92s split at <=25s ceiling => at least 4 chunks.
	    assert raw.get("chunk_count", 0) >= 4
	    assert result.language_code == "en-IN"


	def test_short_utterance_single_call_unchanged(patch_pydub, patch_httpx, monkeypatch):
	    """A sub-ceiling clip must take exactly ONE Sarvam call (no behaviour
	    change / no extra latency for the common case)."""
	    # Patch the factory to return a short 8s clip. The replacement accepts
	    # *a, **k so it works whether the decoder is called positionally or
	    # with keywords such as format=..., like the stub it replaces.
	    short = FakeAudioSegment(0, 8_000)
	    monkeypatch.setattr(
	        FakeAudioSegmentFactory, "from_file", staticmethod(lambda *a, **k: short)
	    )

	    # Count HTTP posts so the "exactly ONE call" contract is asserted
	    # directly rather than inferred from the transcript.
	    post_calls = []
	    real_post = _FakeClient.post

	    async def counting_post(self, *args, **kwargs):
	        post_calls.append(1)
	        return await real_post(self, *args, **kwargs)

	    monkeypatch.setattr(_FakeClient, "post", counting_post)

	    stt = SarvamSTT()
	    result = asyncio.run(
	        stt.transcribe(audio_bytes=b"\x00" * 4096, audio_format="webm")
	    )
	    words = result.text.split()
	    assert words == [f"word_{i}" for i in range(8)]
	    assert len(post_calls) == 1, f"expected 1 Sarvam call, got {len(post_calls)}"
	    # Single-call path does NOT set the chunked marker.
	    assert result.raw.get("chunked") is not True


	def test_no_chunk_exceeds_sarvam_safe_ceiling(patch_pydub):
	    """Every produced chunk must be <= STT_CHUNK_MS so Sarvam never silently
	    truncates a chunk. This is the core invariant the fix rests on."""
	    utterance = FakeAudioSegment(0, 92_000)
	    pieces = SarvamSTT._split_on_silence(utterance, STT_CHUNK_MS)
	    assert len(pieces) >= 4
	    for piece in pieces:
	        assert len(piece) <= STT_CHUNK_MS, f"chunk {len(piece)} ms exceeds {STT_CHUNK_MS}"
	    # Together the pieces must cover the utterance end to end, with each
	    # piece starting exactly where the previous one stopped.
	    first, last = pieces[0], pieces[-1]
	    assert first.start_ms == 0
	    assert last.end_ms == 92_000
	    for left, right in zip(pieces, pieces[1:]):
	        assert left.end_ms == right.start_ms, "chunk boundary gap/overlap loses audio"