"""
HuggingFace Inference Endpoints custom handler for microsoft/VibeVoice-ASR-HF.

Deploy steps:
  1. Fork microsoft/VibeVoice-ASR-HF on HuggingFace
  2. Copy THIS file into the fork root as `handler.py`
  3. Create HF Inference Endpoint pointing at your fork
  4. Set VIBEVOICE_HF_ENDPOINT_URL + HF_TOKEN in .env

Input  : raw audio bytes (wav/mp3/flac/m4a/ogg)
Output : {"transcript": str, "segments": [{"Start": float, "End": float, "Speaker": int, "Content": str}]}

Docs:
  https://huggingface.co/docs/inference-endpoints/guides/custom_handler
"""
from __future__ import annotations

import tempfile
import os
from typing import Any, Dict

# Sampling rate in Hz. Incoming audio is decoded with this rate and the
# temporary WAV file handed to the processor is written with it as well.
SAMPLE_RATE = 24_000   # VibeVoice-ASR-HF requires 24 kHz


class EndpointHandler:
    """Inference Endpoints handler that transcribes audio with VibeVoice-ASR.

    The processor and model are loaded once in ``__init__``. Each call decodes
    the request audio, runs generation and returns the full transcript together
    with the per-segment entries produced by the processor.
    """

    # Context hint used when the request does not carry its own ``prompt``.
    DEFAULT_PROMPT = "Midwife medical consultation in German"

    def __init__(self, path: str = ""):
        """Load the processor and the model.

        Args:
            path: Model directory to load from. When empty, the hub id
                ``microsoft/VibeVoice-ASR-HF`` is used instead.
        """
        from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
        import torch

        model_path = path or "microsoft/VibeVoice-ASR-HF"

        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = VibeVoiceAsrForConditionalGeneration.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transcribe one audio payload.

        Args:
            data["inputs"]     : raw audio bytes (any ffmpeg-supported format)
            data["parameters"] : optional dict
                prompt              (str)  context hint, e.g. "Medical midwife consultation"
                tokenizer_chunk_size (int) samples per chunk — reduce for low VRAM

        Returns:
            ``{"transcript": str, "segments": [{"Start", "End", "Speaker", "Content"}]}``

        Raises:
            TypeError: if the audio payload is not bytes-like.
            ValueError: if the audio payload is empty, or ``tokenizer_chunk_size``
                cannot be converted to ``int``.
        """
        # Validate the request before any heavy import or decoding work so that
        # malformed requests fail fast with a clear message.
        audio_bytes, parameters = self._parse_request(data)

        prompt = parameters.get("prompt", self.DEFAULT_PROMPT)
        tokenizer_chunk_size = parameters.get("tokenizer_chunk_size")
        if tokenizer_chunk_size is not None:
            tokenizer_chunk_size = int(tokenizer_chunk_size)

        parsed = self._transcribe(audio_bytes, prompt, tokenizer_chunk_size)
        return self._build_response(parsed)

    @staticmethod
    def _parse_request(data: Dict[str, Any]) -> tuple:
        """Extract ``(audio_bytes, parameters)`` without mutating ``data``."""
        # Same fallback as before: when "inputs" is absent the payload itself is
        # taken as the audio, which the type check below then rejects for dicts.
        raw = data.get("inputs", data)
        if not isinstance(raw, (bytes, bytearray, memoryview)):
            raise TypeError(
                f"'inputs' must be raw audio bytes, got {type(raw).__name__}"
            )

        audio_bytes = bytes(raw)
        if not audio_bytes:
            raise ValueError("'inputs' is empty; expected raw audio bytes")

        parameters = data.get("parameters") or {}
        return audio_bytes, parameters

    def _transcribe(self, audio_bytes: bytes, prompt: str, tokenizer_chunk_size: Any) -> Any:
        """Run the model on ``audio_bytes`` and return the first decoded item."""
        # Import everything up front so a missing dependency cannot leave a
        # temporary file behind.
        import soundfile as sf
        import torch
        from transformers.pipelines.audio_utils import ffmpeg_read

        # Decode audio bytes → numpy array at SAMPLE_RATE → temp wav file;
        # the audio is handed to the processor as a file path.
        audio_np = ffmpeg_read(audio_bytes, SAMPLE_RATE)

        # Reserve a path only; the handle is closed before anything writes to it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            # Writing happens inside the try so the file is removed even when
            # the write itself fails.
            sf.write(tmp_path, audio_np, SAMPLE_RATE)

            inputs = self.processor.apply_transcription_request(
                audio=tmp_path,
                prompt=prompt,
            ).to(self.model.device, self.model.dtype)

            generate_kwargs: dict = {}
            if tokenizer_chunk_size is not None:
                generate_kwargs["tokenizer_chunk_size"] = tokenizer_chunk_size

            with torch.inference_mode():
                output_ids = self.model.generate(**inputs, **generate_kwargs)

            # Drop the prompt tokens so only newly generated ids are decoded.
            generated_ids = output_ids[:, inputs["input_ids"].shape[1]:]
            decoded = self.processor.decode(generated_ids, return_format="parsed")
        finally:
            os.unlink(tmp_path)

        return decoded[0] if decoded else []

    @staticmethod
    def _build_response(parsed: Any) -> Dict[str, Any]:
        """Convert the decoded output into the endpoint's response dict."""
        # NOTE(review): the processor may hand back the raw text instead of a
        # list of segments when the model output cannot be parsed — confirm.
        # Returning it as the transcript avoids failing the whole request.
        if isinstance(parsed, str):
            return {"transcript": parsed.strip(), "segments": []}

        segments = [
            {
                # ``or`` guards against explicit None values from the parser.
                "Start": float(seg.get("Start") or 0),
                "End": float(seg.get("End") or 0),
                "Speaker": seg.get("Speaker"),
                "Content": (seg.get("Content") or "").strip(),
            }
            for seg in (parsed or [])
        ]
        transcript = " ".join(seg["Content"] for seg in segments).strip()

        return {"transcript": transcript, "segments": segments}