import json
from typing import Dict

import whisper
from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read

# Sample rate (Hz) Whisper models expect their input audio to use.
SAMPLE_RATE = 16000


class EndpointHandler():
    """Custom inference-endpoint handler that transcribes audio with Whisper."""

    def __init__(self, path: str = ""):
        # NOTE(review): `path` (the model repository directory supplied by the
        # endpoint runtime) is ignored and a fixed checkpoint is loaded instead —
        # confirm this is intentional.
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe the submitted audio and return word-level timestamps.

        Args:
            data: request payload; the raw audio bytes are expected under the
                "inputs" key (the payload itself is used as a fallback).

        Returns:
            A dict with a single "text" key whose value is a JSON-encoded list
            of segments, each carrying start/end timestamps, the segment text,
            and per-word timings.
        """
        # Accept either {"inputs": <bytes>} or the raw payload directly.
        inputs = data.pop("inputs", data)

        # Decode the raw bytes into a waveform resampled to Whisper's rate.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )

        # Flatten the Whisper output into plain JSON-serializable records.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {"start": word["start"], "end": word["end"], "word": word["word"]}
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]

        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}