# app/utils.py
import torch
import threading
from models import build_model
from kokoro import generate
from typing import Tuple, Dict
import numpy as np
from functools import lru_cache
from bs4 import BeautifulSoup
from markdown import markdown
import re

# Voice id -> human-readable label. An id names the file loaded from
# `voices/<id>.pt`, and its first character is passed on as the language code.
# Insertion order is preserved as-is for anything that lists the voices.
AVAILABLE_VOICES = {
    "af": "Default (Bella & Sarah mix)",
    "af_bella": "American Female - Bella",
    "af_sarah": "American Female - Sarah",
    "am_adam": "American Male - Adam",
    "am_michael": "American Male - Michael",
    "bf_emma": "British Female - Emma",
    "bf_isabella": "British Female - Isabella",
    "bm_george": "British Male - George",
    "bm_lewis": "British Male - Lewis",
    "af_nicole": "American Female - Nicole",
    "af_sky": "American Female - Sky",
}

class TTSManager:
    """Singleton that owns the TTS model and the loaded voicepack tensors.

    Every ``TTSManager()`` call returns the same object. The model is built
    the first time the object is constructed; voicepacks are loaded lazily,
    one file per voice, and kept in ``self.voicepacks``.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        # Serialise creation so concurrent callers all receive one instance.
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
            return cls._instance

    def __init__(self):
        # __init__ runs on every TTSManager() call, even though __new__ hands
        # back the existing instance. The check-and-initialise must happen
        # under the lock, otherwise two threads can both see "not initialised"
        # and each build the model. __new__ has already released the
        # (non-reentrant) lock by the time __init__ is entered.
        with self._lock:
            if hasattr(self, 'initialized'):
                return
            torch.set_num_threads(4)
            self.device = 'cpu'
            self.model = None
            self.voicepacks: Dict[str, torch.Tensor] = {}
            self._initialize_model()
            # Set last: if model loading raises, a later TTSManager() retries.
            self.initialized = True

    def _initialize_model(self):
        """Build the model from ``kokoro-v0_19.pth`` and store it on ``self.model``.

        ``build_model`` may return either a dict (stored as-is) or an object
        with ``.to()``, which is moved to ``self.device``.

        Raises:
            Exception: whatever ``build_model`` raised, after printing it.
        """
        try:
            with torch.no_grad():
                built = build_model('kokoro-v0_19.pth', self.device)
                if isinstance(built, dict):
                    self.model = built
                else:
                    self.model = built.to(self.device)
        except Exception as e:
            print(f"Error initializing model: {e}")
            raise

    def _load_voicepack(self, voice_name: str) -> torch.Tensor:
        """Return the voicepack tensor for ``voice_name``, loading it on first use.

        The tensor is read from ``voices/<voice_name>.pt`` and memoised in
        ``self.voicepacks``. ``voice_name`` is interpolated into the path
        unchecked, so callers must pass a vetted id (``generate_speech`` does).
        """
        if voice_name not in self.voicepacks:
            with torch.no_grad():
                voicepack = torch.load(
                    f'voices/{voice_name}.pt',
                    weights_only=True,
                    map_location=self.device
                )
                self.voicepacks[voice_name] = voicepack.to(self.device)
        return self.voicepacks[voice_name]

    # lru_cache on a method also keys on `self` and keeps it alive for the
    # cache's lifetime; that is acceptable here only because the class is a
    # process-lifetime singleton.
    @lru_cache(maxsize=100)
    def _generate_speech_cached(self, text: str, voice_name: str) -> Tuple[bytes, str]:
        """Synthesise ``text`` with ``voice_name``; memoised on (text, voice_name).

        Returns:
            ``(audio_bytes, phonemes)``. The audio is stored as raw bytes so
            the cached value is immutable and cannot be altered by callers.
        """
        with torch.no_grad():
            voicepack = self._load_voicepack(voice_name)
            audio, phonemes = generate(
                self.model,
                text,
                voicepack,
                # The first character of the voice id is used as the language
                # code (ids in AVAILABLE_VOICES start with 'a' or 'b').
                lang=voice_name[0]
            )
            return audio.tobytes(), phonemes

    def generate_speech(self, text: str, voice_name: str = 'af') -> Tuple[np.ndarray, str]:
        """Synthesise ``text`` and return ``(audio, phonemes)``.

        Args:
            text: Text to speak. Empty or whitespace-only text yields an empty
                float32 array and an empty phoneme string.
            voice_name: Key of ``AVAILABLE_VOICES``; unknown names silently
                fall back to ``'af'``.

        Returns:
            A float32 array decoded from the cached bytes, and the phoneme
            string. For non-empty results the array is a read-only view
            (``np.frombuffer`` over ``bytes``); copy it before modifying.
        """
        # Blank input has nothing to synthesise; return empty audio instead of
        # invoking the model.
        if not text or not text.strip():
            return np.zeros(0, dtype=np.float32), ""
        if voice_name not in AVAILABLE_VOICES:
            voice_name = 'af'
        audio_bytes, phonemes = self._generate_speech_cached(text, voice_name)
        # NOTE(review): assumes `generate` yields float32 samples — confirm.
        audio = np.frombuffer(audio_bytes, dtype=np.float32)
        return audio, phonemes

# Patterns for code markup in the rendered HTML. DOTALL lets '.' cross the
# newlines inside multi-line code blocks, and `[^>]*` tolerates attributes on
# the opening tag in case the renderer adds any.
_PRE_BLOCK_RE = re.compile(r'<pre\b[^>]*>.*?</pre>', re.DOTALL)
_INLINE_CODE_RE = re.compile(r'<code\b[^>]*>.*?</code>', re.DOTALL)


def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text, dropping code snippets.

    The markdown is rendered to HTML, every ``<pre>...</pre>`` and
    ``<code>...</code>`` element is replaced by a single space, and the
    remaining text nodes are concatenated.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plain text with code removed.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets. <pre> goes first so a nested <code> inside it is
    # removed together with its block.
    html = _PRE_BLOCK_RE.sub(' ', html)
    html = _INLINE_CODE_RE.sub(' ', html)

    # Extract text; find_all(string=True) is the current spelling of the
    # legacy findAll(text=True).
    soup = BeautifulSoup(html, "html.parser")
    return ''.join(soup.find_all(string=True))