"""In-process inference backend for Hugging Face ZeroGPU Spaces.

All three stages run in one Python environment on a ZeroGPU slice, exposed as
plain functions (``describe_scene``, ``transcribe_audio``, ``speak``) so the
Gradio app can call them exactly like the Modal backend.

Model stack (single, Transformers >= 5.4 compatible environment):
  * Vision / OCR      -> Qwen/Qwen2.5-VL-3B-Instruct  (bilingual EN/ZH, < 4B)
  * Speech-to-text    -> CohereLabs/cohere-transcribe-03-2026 (via cohere_stt)
  * Text-to-speech    -> openbmb/VoxCPM2

Models are lazy-loaded once and cached; loading happens inside the GPU context
so it works under ZeroGPU's on-demand allocation.
"""

from __future__ import annotations

import io

# ``spaces`` only exists on a Hugging Face Space. Fall back to a no-op decorator
# so this module still imports in a plain environment (e.g. for unit tests).
try:
    import spaces

    GPU = spaces.GPU
except Exception:  # pragma: no cover - exercised only off-Space

    def GPU(*args, **kwargs):
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]

        def decorator(fn):
            return fn

        return decorator


VISION_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
TTS_MODEL_ID = "openbmb/VoxCPM2"

_vision = None
_tts = None


# --------------------------------------------------------------------------- #
# Pure helper (unit-testable, no GPU): stitch overlapping OCR bands.
# --------------------------------------------------------------------------- #
def stitch_overlapping_text(parts: list[str], max_overlap_words: int = 12) -> str:
    """Join OCR results from overlapping image bands, removing the duplicated
    region. Finds the longest suffix of the running text that matches the prefix
    of the next part (case-insensitive) and drops it."""
    parts = [p.strip() for p in parts if p and p.strip()]
    if not parts:
        return ""
    words = parts[0].split()
    for nxt in parts[1:]:
        nwords = nxt.split()
        limit = min(len(words), len(nwords), max_overlap_words)
        overlap = 0
        for k in range(limit, 0, -1):
            if [w.lower() for w in words[-k:]] == [w.lower() for w in nwords[:k]]:
                overlap = k
                break
        words += nwords[overlap:]
    return " ".join(words)


# --------------------------------------------------------------------------- #
# Vision / OCR — Qwen2.5-VL
# --------------------------------------------------------------------------- #
def _load_vision():
    global _vision
    if _vision is None:
        import torch
        from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        model = (
            Qwen2_5_VLForConditionalGeneration.from_pretrained(
                VISION_MODEL_ID,
                torch_dtype=dtype,
            )
            .to("cuda")
            .eval()
        )
        # Cap visual tokens so menus/labels stay fast without losing legible text.
        processor = AutoProcessor.from_pretrained(
            VISION_MODEL_ID,
            min_pixels=256 * 28 * 28,
            max_pixels=1280 * 28 * 28,
        )
        param = next(model.parameters())
        print(
            f"[third-eye VISION] loaded {VISION_MODEL_ID} "
            f"| device={param.device} | dtype={param.dtype}",
            flush=True,
        )
        _vision = (model, processor)
    return _vision


def _chat_once(model, processor, image, prompt: str) -> str:
    import torch

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.2,
        )
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
    answer = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return answer.strip()


@GPU(duration=120)
def describe_scene(
    image_bytes: bytes, question: str, lang: str = "en", tile: bool = False
) -> str:
    import time

    from PIL import Image

    model, processor = _load_vision()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    prompt = question.strip() or "Describe everything visible for a blind user."
    if lang == "zh":
        prompt += " Answer in Chinese."

    start = time.time()
    if not tile:
        answer = _chat_once(model, processor, image, prompt)
        print(f"[third-eye VISION] chat: {time.time() - start:.2f}s", flush=True)
        return answer

    # Tiled OCR for verbatim Read Text mode: splitting into overlapping top/bottom
    # bands enlarges the relative text per call; the overlap is stitched away.
    w, h = image.size
    bands = [(0, 0, w, int(h * 0.55)), (0, int(h * 0.45), w, h)]
    parts = [_chat_once(model, processor, image.crop(box), prompt) for box in bands]
    answer = stitch_overlapping_text(parts)
    print(
        f"[third-eye VISION] tiled chat ({len(bands)} bands): "
        f"{time.time() - start:.2f}s",
        flush=True,
    )
    return answer


# --------------------------------------------------------------------------- #
# Speech-to-text — Cohere Transcribe (shared with the Modal backend)
# --------------------------------------------------------------------------- #
@GPU(duration=120)
def transcribe_audio(audio_bytes: bytes, language: str = "en") -> str:
    from cohere_stt import transcribe_wav_bytes

    return transcribe_wav_bytes(audio_bytes, language)


# --------------------------------------------------------------------------- #
# Text-to-speech — VoxCPM2
# --------------------------------------------------------------------------- #
def _load_tts():
    global _tts
    if _tts is None:
        from voxcpm import VoxCPM

        _tts = VoxCPM.from_pretrained(TTS_MODEL_ID, load_denoiser=False)
    return _tts


@GPU(duration=120)
def speak(text: str, lang: str = "en") -> bytes:
    import numpy as np
    import soundfile as sf

    if not text.strip():
        raise ValueError("Cannot synthesize empty text.")

    model = _load_tts()
    waveform = model.generate(
        text=text.strip()[:500],
        cfg_value=2.0,
        inference_timesteps=10,
    )
    output = io.BytesIO()
    sf.write(
        output,
        np.asarray(waveform, dtype=np.float32),
        model.tts_model.sample_rate,
        format="WAV",
    )
    return output.getvalue()