import spaces
import torch
from transformers import AutoProcessor, BarkModel
import librosa
import numpy as np

# Model identifier passed to `from_pretrained` for both the processor and the model.
VOICE_MODEL_ID = "suno/bark-small"

# Maps each voice key accepted by `synthesize` to the preset string that is
# handed to the processor as `voice_preset`.
VOICE_DESIGN = {
    "sml": "v2/en_speaker_6",
    "chop": "v2/en_speaker_2",
    "agressor": "v2/en_speaker_1",
}

# Module-level cache: both stay None until get_model() loads them.
_processor, _model = None, None


def get_model():
    """Return the shared ``(processor, model)`` pair, loading it on first use.

    On the first call the processor and the Bark model named by
    ``VOICE_MODEL_ID`` are loaded, the model is placed on CUDA in float16,
    and both are stored in module globals. Later calls return the stored
    pair without reloading.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global _model, _processor
    if _model is not None:
        return _processor, _model

    print(f"[voice] loading {VOICE_MODEL_ID} ...", flush=True)
    # Bark is supported by stock transformers; half precision keeps VRAM usage down.
    _processor = AutoProcessor.from_pretrained(VOICE_MODEL_ID)
    bark = BarkModel.from_pretrained(VOICE_MODEL_ID, torch_dtype=torch.float16)
    _model = bark.to("cuda")
    print("[voice] model is ready.")
    return _processor, _model


@spaces.GPU(duration=50)
def synthesize(text: str, voice_key: str):
    """Generate speech for ``text`` using the voice selected by ``voice_key``.

    Args:
        text: Text to speak. Empty or whitespace-only text yields ``None``.
        voice_key: Key into ``VOICE_DESIGN``; unknown keys fall back to
            ``"v2/en_speaker_6"``.

    Returns:
        ``None`` when there is nothing to speak, otherwise a
        ``(sample_rate, waveform)`` tuple where ``waveform`` is a float32
        NumPy array time-stretched to 1.25x speed.
    """
    # Whitespace-only text carries nothing to speak; bail out before any
    # model work is done.
    if not text or not text.strip():
        return None
    processor, model = get_model()
    voice_preset = VOICE_DESIGN.get(voice_key, "v2/en_speaker_6")

    print(f"[voice] generating voice from {text}", flush=True)
    # Named `inputs` rather than `input` so the builtin is not shadowed.
    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt").to("cuda")

    with torch.no_grad():
        speech_output = model.generate(**inputs)

    # The model is loaded in float16; convert to float32 so Numba (used by
    # librosa) can process the array.
    wav = speech_output[0].cpu().numpy().astype(np.float32)

    # Speed up the voice by 1.25x to reduce duration without altering pitch.
    wav = librosa.effects.time_stretch(wav, rate=1.25)

    # Prefer the rate the model reports on its generation config; 24 kHz is
    # the fallback and matches the value previously hard-coded here.
    sample_rate = int(getattr(model.generation_config, "sample_rate", 24000))
    return (sample_rate, wav)


@spaces.GPU(duration=150)
def warmup():
    """Trigger model loading through ``get_model`` and log when it finishes."""
    # The returned (processor, model) pair is not needed here; only the
    # loading side effect of get_model() matters.
    _ = get_model()
    print("[voice] warmup complete.", flush=True)