# app/engines/parler_engine.py
# Parler-TTS neural engine — instruction-controlled voice synthesis.
# Voice is described in plain English rather than selected by ID.
# Directly maps to Bantrly's grade-band persona system.
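# Paper: "Natural language guidance of high-fidelity text-to-speech with
# synthetic annotations" (Lyth & King, 2024); Parler-TTS is HuggingFace's
# open reproduction of that work.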

import time
import torch
import numpy as np
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

from engines.base import TTSEngine

# Module-wide shared state. The device is picked once at import time; the
# model/tokenizer slots start empty and are filled on first use by _get_model().
_device: str = "cuda" if torch.cuda.is_available() else "cpu"
_model = _tokenizer = None

def _get_model():
    """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

    The first successful call loads the Parler-TTS checkpoint and its
    tokenizer, moves the model to ``_device`` and stores both in the
    module-level ``_model`` / ``_tokenizer`` slots. Later calls return the
    stored objects without reloading.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Whatever ``from_pretrained`` raises if either load fails. In that
        case neither global is updated, so the next call retries from scratch.
    """
    global _model, _tokenizer
    if _model is None or _tokenizer is None:
        checkpoint = "parler-tts/parler-tts-mini-v1"
        # Load into locals and publish both globals together. Assigning _model
        # before the tokenizer is loaded would leave a half-initialised cache
        # if the tokenizer load raised: later calls would skip loading and
        # hand back (model, None).
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            checkpoint
        ).to(_device)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _model, _tokenizer = model, tokenizer
    return _model, _tokenizer


class ParlerEngine(TTSEngine):
    """Parler-TTS engine whose voice is selected by a plain-English description.

    Each grade band in ``BAND_CONFIG`` carries a ``description`` string. That
    description is tokenized and passed to the model as ``input_ids``, while
    the text to be spoken is passed as ``prompt_input_ids``.
    """

    name = "Parler-TTS (mini)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # 7-10s latency, needs optimization for production
    requires_internet = False

    # voice descriptions map directly to Moo persona specs
    # Keys are grade-band labels; each value holds the description used to
    # condition the voice for that band.
    BAND_CONFIG = {
        "K-2": {
            "description": (
                "A warm, friendly female voice speaking slowly and clearly, "
                "with an encouraging and celebratory tone, perfect for young children."
            ),
        },
        "3-5": {
            "description": (
                "A curious, energetic female voice speaking at a comfortable pace, "
                "with specific emphasis on key words and an upbeat coaching tone."
            ),
        },
        "6-8": {
            "description": (
                "A calm, direct female voice speaking at a natural pace, "
                "serious but approachable, validating feelings before coaching."
            ),
        },
        "9-12": {
            "description": (
                "A collegiate, honest male voice speaking at a brisk conversational pace, "
                "no hand-holding, multi-part reasoning, treating the listener as an adult."
            ),
        },
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """Synthesize ``text`` in the voice described for ``band`` and save it as WAV.

        Args:
            text: The text to speak; tokenized and passed as the generation prompt.
            band: Grade-band label, resolved via ``self.get_band_config`` (not
                defined in this class; presumably inherited from ``TTSEngine``
                and backed by ``BAND_CONFIG`` -- confirm against the base class).
            output_path: Destination path WITHOUT extension; ``".wav"`` is
                always appended.

        Returns:
            A dict with keys ``audio_path`` (the written file),
            ``latency_seconds`` (generation time rounded to milliseconds),
            ``voice`` (``"described:<band>"``), ``speed`` (always ``1.0``; this
            method passes no speed parameter, pacing is expressed in the
            description text) and ``engine`` (this engine's ``name``).
        """
        config = self.get_band_config(band)
        description = config["description"]

        model, tokenizer = _get_model()

        # Tokenization is deliberately kept outside the timed region.
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(_device)
        prompt_ids = tokenizer(text, return_tensors="pt").input_ids.to(_device)

        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        with torch.no_grad():
            generation = model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_ids,
            )
            # CUDA kernels run asynchronously; copying the result to the host
            # waits for all queued GPU work, so doing it before stopping the
            # timer keeps unfinished GPU work out of the blind spot.
            waveform = generation.cpu()
        latency = round(time.perf_counter() - start, 3)

        # Cast to float32 before converting: a no-op for a float32 model, and
        # it keeps .numpy() working if the model is ever run in bfloat16,
        # which numpy cannot represent.
        audio = waveform.float().numpy().squeeze()
        full_path = output_path + ".wav"
        sf.write(full_path, audio, model.config.sampling_rate)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": f"described:{band}",
            "speed": 1.0,
            "engine": self.name,
        }