"""Regression test for KI-302 (2026-05-18) — full voice transcript.

BUG (user-reported, live): "Full transcript still not coming while
speaking." A long spoken utterance was truncated to a partial transcript.

ROOT CAUSE: Sarvam's saarika REST /speech-to-text endpoint has a hard ~30s
audio limit. It does NOT 4xx on longer audio — it returns HTTP 200 with a
`transcript` containing ONLY the first ~30s and silently drops the rest.
The live-voice hook's grace-window batching deliberately merges multiple
pause-separated speech bursts into ONE blob, so any real-world utterance
with natural pauses easily exceeds 30s and was silently cut.

FIX: `SarvamSTT.transcribe` now decodes the audio, and if it exceeds the
safe REST ceiling it splits it into <= STT_CHUNK_MS chunks (at silence
boundaries where possible), transcribes each chunk, and concatenates the
transcripts in order so the COMPLETE utterance survives.

This test pins that contract WITHOUT needing ffmpeg/pydub by stubbing a
minimal AudioSegment that supports the slicing / len / dBFS / export
surface the splitter uses, and a fake Sarvam HTTP layer that returns a
DIFFERENT transcript per 25s window — so a regression to single-shot
behaviour (only the first window transcribed) fails loudly.

Run:
    cd /Users/rohitsar/Developer/Insurance\\ Sales\\ Bot
    PYTHONPATH=$PWD .venv/bin/python -m pytest \
        tests/test_stt_long_audio_chunking.py -v
"""

from __future__ import annotations

import asyncio
import os
import sys
import types

import pytest

os.environ.setdefault("SARVAM_API_KEY", "test-key-for-stt-chunking")

from backend.providers.sarvam_stt import (  # noqa: E402
    STT_CHUNK_MS,
    SarvamSTT,
)


# ---------------------------------------------------------------------------
# Minimal pydub.AudioSegment stub.
# The splitter only uses: len(seg) -> ms, seg[a:b] -> sub-segment,
# seg.dBFS -> float, seg.export(buf, format=...) , set_frame_rate/channels/
# sample_width (chained no-ops), AudioSegment.from_file(...).
# Each "ms" carries a sentinel byte so a concatenated/exported chunk can be
# decoded back into the exact ms range it represents — that lets the fake
# Sarvam server return the words spoken in that window.
# ---------------------------------------------------------------------------
class FakeAudioSegment:
    def __init__(self, start_ms: int, end_ms: int):
        self.start_ms = start_ms
        self.end_ms = end_ms  # exclusive

    # --- duration -----------------------------------------------------------
    def __len__(self):
        return self.end_ms - self.start_ms

    # --- slicing (pydub uses ms-based slicing) ------------------------------
    def __getitem__(self, sl):
        if isinstance(sl, slice):
            lo = 0 if sl.start is None else sl.start
            hi = len(self) if sl.stop is None else sl.stop
            lo = max(0, min(lo, len(self)))
            hi = max(0, min(hi, len(self)))
            return FakeAudioSegment(self.start_ms + lo, self.start_ms + hi)
        raise TypeError("only slice indexing used by splitter")

    # --- loudness -----------------------------------------------------------
    @property
    def dBFS(self):
        # Constant moderate loudness everywhere EXCEPT a deliberate silent
        # gap at [24000, 24400) ms so _split_on_silence has a real pause to
        # snap to near the 25s ceiling.
        if self.end_ms - self.start_ms == 0:
            return float("-inf")
        if 24000 <= self.start_ms < 24400:
            return float("-inf")
        return -20.0

    # --- transcode no-ops ---------------------------------------------------
    def set_frame_rate(self, _):
        return self

    def set_channels(self, _):
        return self

    def set_sample_width(self, _):
        return self

    def export(self, buf, format="wav"):  # noqa: A002 - pydub signature
        # Encode the ms range so the fake Sarvam server can recover which
        # words this chunk covers.
        buf.write(f"FAKEWAV:{self.start_ms}:{self.end_ms}".encode())
        buf.seek(0)
        return buf


class FakeAudioSegmentFactory:
    @staticmethod
    def from_file(_bio, format=None):  # noqa: A002 - pydub signature
        # A 92-second utterance (> 3 Sarvam windows of 25s).
        return FakeAudioSegment(0, 92_000)


@pytest.fixture
def patch_pydub(monkeypatch):
    fake_pydub = types.ModuleType("pydub")
    fake_pydub.AudioSegment = FakeAudioSegmentFactory
    monkeypatch.setitem(sys.modules, "pydub", fake_pydub)
    yield


# ---------------------------------------------------------------------------
# Fake Sarvam HTTP layer. Returns the words spoken in the chunk's ms window.
# The "spoken script" is one word per second: word_0 .. word_91. Sarvam's
# real 30s truncation is simulated by capping any single call at 30000 ms of
# audio (it ignores everything past 30s of the blob it receives) — exactly
# the silent-truncation behaviour the fix must defeat by never sending a
# chunk longer than ~25s.
# ---------------------------------------------------------------------------
SARVAM_HARD_LIMIT_MS = 30_000


class _FakeResp:
    def __init__(self, payload):
        self._payload = payload

    def raise_for_status(self):
        return None

    def json(self):
        return self._payload


class _FakeClient:
    def __init__(self, *a, **k):
        pass

    async def __aenter__(self):
        return self

    async def __aexit__(self, *a):
        return False

    async def post(self, url, headers=None, files=None, data=None):
        raw = files["file"][1].read()
        text = raw.decode()
        assert text.startswith("FAKEWAV:"), text
        _, s, e = text.split(":")
        start_ms, end_ms = int(s), int(e)
        # Simulate Sarvam's silent 30s truncation on the received blob.
        capped_end = min(end_ms, start_ms + SARVAM_HARD_LIMIT_MS)
        words = [
            f"word_{ms // 1000}"
            for ms in range(start_ms, capped_end, 1000)
        ]
        return _FakeResp({
            "transcript": " ".join(words),
            "language_code": "en-IN",
            "language_probability": 0.99,
        })


@pytest.fixture
def patch_httpx(monkeypatch):
    import backend.providers.sarvam_stt as mod

    monkeypatch.setattr(mod.httpx, "AsyncClient", _FakeClient)
    yield


def test_long_utterance_is_fully_transcribed(patch_pydub, patch_httpx):
    """A 92s utterance must yield ALL 92 words, not just the first ~30."""
    stt = SarvamSTT()
    # > 1 KB so the short-audio guard doesn't short-circuit.
    audio = b"\x00" * 4096
    result = asyncio.run(
        stt.transcribe(audio_bytes=audio, audio_format="webm", language_code="en-IN")
    )

    words = result.text.split()
    # CORE CONTRACT: every one of the 92 spoken words must survive — the
    # pre-fix single-shot path returned only word_0..word_29 (Sarvam's 30s
    # silent truncation). A word landing exactly on a silence-snap seam can
    # legitimately appear in BOTH adjoining chunks (a duplicated word at the
    # seam is harmless; a LOST word is the bug). So we assert: (1) no word
    # is missing, (2) order is non-decreasing, (3) the count is ~92 (never
    # ~30). De-duplicating consecutive repeats must reproduce the exact
    # script.
    expected = [f"word_{i}" for i in range(92)]
    deduped = [w for i, w in enumerate(words) if i == 0 or w != words[i - 1]]
    assert deduped == expected, (
        f"transcript truncated/garbled: deduped {len(deduped)} words "
        f"(first={deduped[:3]}, last={deduped[-3:]}), expected 92 in order"
    )
    # Hard proof this is NOT the pre-fix 30s truncation.
    assert len(words) >= 90
    assert "word_91" in words and "word_60" in words
    assert result.raw.get("chunked") is True
    # 92s split at <=25s ceiling => at least 4 chunks.
    assert result.raw.get("chunk_count", 0) >= 4
    assert result.language_code == "en-IN"


def test_short_utterance_single_call_unchanged(patch_pydub, patch_httpx, monkeypatch):
    """A sub-ceiling clip must take exactly ONE Sarvam call (no behaviour
    change / no extra latency for the common case)."""
    # Patch the factory to return a short 8s clip.
    short = FakeAudioSegment(0, 8_000)
    monkeypatch.setattr(
        FakeAudioSegmentFactory, "from_file", staticmethod(lambda *a, **k: short)
    )
    stt = SarvamSTT()
    result = asyncio.run(
        stt.transcribe(audio_bytes=b"\x00" * 4096, audio_format="webm")
    )
    words = result.text.split()
    assert words == [f"word_{i}" for i in range(8)]
    # Single-call path does NOT set the chunked marker.
    assert result.raw.get("chunked") is not True


def test_no_chunk_exceeds_sarvam_safe_ceiling(patch_pydub):
    """Every produced chunk must be <= STT_CHUNK_MS so Sarvam never silently
    truncates a chunk. This is the core invariant the fix rests on."""
    seg = FakeAudioSegment(0, 92_000)
    chunks = SarvamSTT._split_on_silence(seg, STT_CHUNK_MS)
    assert len(chunks) >= 4
    for c in chunks:
        assert len(c) <= STT_CHUNK_MS, f"chunk {len(c)} ms exceeds {STT_CHUNK_MS}"
    # Chunks must tile the whole utterance with no gap and no overlap.
    assert chunks[0].start_ms == 0
    assert chunks[-1].end_ms == 92_000
    for a, b in zip(chunks, chunks[1:]):
        assert a.end_ms == b.start_ms, "chunk boundary gap/overlap loses audio"