# app/utils.py
import re
import threading
from functools import lru_cache
from typing import Dict, Tuple

import numpy as np
import torch
from bs4 import BeautifulSoup
from markdown import markdown

from kokoro import generate
from models import build_model

# Voice identifiers accepted by TTSManager.generate_speech, mapped to a
# human-readable label. The first character of the identifier is passed to
# `generate` as its `lang` argument.
AVAILABLE_VOICES = {
    'af': 'Default (Bella & Sarah mix)',
    'af_bella': 'American Female - Bella',
    'af_sarah': 'American Female - Sarah',
    'am_adam': 'American Male - Adam',
    'am_michael': 'American Male - Michael',
    'bf_emma': 'British Female - Emma',
    'bf_isabella': 'British Female - Isabella',
    'bm_george': 'British Male - George',
    'bm_lewis': 'British Male - Lewis',
    'af_nicole': 'American Female - Nicole',
    'af_sky': 'American Female - Sky',
}


class TTSManager:
    """Process-wide singleton that owns the TTS model and loaded voicepacks.

    Every ``TTSManager()`` call returns the same instance; the model is
    built the first time the instance is initialised.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Load the model once; later calls are no-ops.

        Raises:
            Exception: whatever ``_initialize_model`` re-raises when the
                model cannot be built.
        """
        # Python calls __init__ on every TTSManager() even though __new__
        # hands back the same object. Checking the flag under the same lock
        # used by __new__ keeps two concurrent first calls from both
        # building the model.
        with self._lock:
            if getattr(self, 'initialized', False):
                return
            torch.set_num_threads(4)
            self.device = 'cpu'
            self.model = None
            self.voicepacks: Dict[str, torch.Tensor] = {}
            self._initialize_model()
            # Set last so that a failed load is retried on the next call.
            self.initialized = True

    def _initialize_model(self):
        """Build the model from ``kokoro-v0_19.pth`` and store it on ``self.model``.

        Raises:
            Exception: any error from building the model is printed and
                re-raised unchanged.
        """
        try:
            with torch.no_grad():
                model_dict = build_model('kokoro-v0_19.pth', self.device)
                # build_model may hand back a dict (stored as-is) or an
                # object exposing .to(), which is moved to the device.
                if isinstance(model_dict, dict):
                    self.model = model_dict
                else:
                    self.model = model_dict.to(self.device)
        except Exception as e:
            print(f"Error initializing model: {e}")
            raise

    def _load_voicepack(self, voice_name: str) -> torch.Tensor:
        """Return the voicepack tensor for *voice_name*, loading it on first use.

        The tensor is read from ``voices/<voice_name>.pt`` (a path relative
        to the current working directory) and kept in ``self.voicepacks``
        for subsequent calls.
        """
        if voice_name not in self.voicepacks:
            with torch.no_grad():
                voicepack = torch.load(
                    f'voices/{voice_name}.pt',
                    weights_only=True,
                    map_location=self.device,
                )
                self.voicepacks[voice_name] = voicepack.to(self.device)
        return self.voicepacks[voice_name]

    # lru_cache on a method also keys on `self`; that is harmless here
    # because __new__ only ever returns the single shared instance.
    @lru_cache(maxsize=100)
    def _generate_speech_cached(self, text: str, voice_name: str) -> Tuple[bytes, str]:
        """Synthesise *text* with *voice_name* and return ``(raw_audio_bytes, phonemes)``.

        Results are memoised for up to 100 distinct ``(text, voice_name)``
        pairs. The audio is returned as ``audio.tobytes()`` of whatever
        ``generate`` produced.
        """
        with torch.no_grad():
            voicepack = self._load_voicepack(voice_name)
            audio, phonemes = generate(
                self.model,
                text,
                voicepack,
                lang=voice_name[0],
            )
            return audio.tobytes(), phonemes

    def generate_speech(self, text: str, voice_name: str = 'af') -> Tuple[np.ndarray, str]:
        """Return ``(audio, phonemes)`` for *text* spoken with *voice_name*.

        Args:
            text: The text to synthesise.
            voice_name: A key of ``AVAILABLE_VOICES``; any other value
                falls back to ``'af'``.

        Returns:
            A 1-D ``float32`` array decoded from the cached audio bytes,
            and the phoneme value returned by ``generate``. Because the
            array is built with ``np.frombuffer`` over a ``bytes`` object
            it is read-only.
        """
        if voice_name not in AVAILABLE_VOICES:
            voice_name = 'af'
        audio_bytes, phonemes = self._generate_speech_cached(text, voice_name)
        # NOTE(review): assumes `generate` yields float32 samples - confirm.
        audio = np.frombuffer(audio_bytes, dtype=np.float32)
        return audio, phonemes


def markdown_to_text(markdown_string: str) -> str:
    """Convert a markdown string to plain text, dropping code snippets."""
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets, replacing each with a single space.
    # NOTE(review): the tag names in these two patterns were reconstructed
    # from the "remove code snippets" intent - confirm against the upstream
    # version of this file.
    # re.DOTALL lets `.` cross newlines so multi-line code blocks match.
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))
    return text