# Kokoro-tts-api / app/utils.py
# pvanand's picture
# Update app/utils.py
# afe9b6c verified
# app/utils.py
import torch
import threading
from models import build_model
from kokoro import generate
from typing import Tuple, Dict
import numpy as np
from functools import lru_cache
from bs4 import BeautifulSoup
from markdown import markdown
import re
# Voice id -> human-readable label.  Ids outside this table are replaced by
# 'af' in TTSManager.generate_speech; the first character of the id is passed
# to the synthesizer as its ``lang`` argument.
AVAILABLE_VOICES: Dict[str, str] = {
    # Default blend
    "af": "Default (Bella & Sarah mix)",
    # American voices
    "af_bella": "American Female - Bella",
    "af_sarah": "American Female - Sarah",
    "am_adam": "American Male - Adam",
    "am_michael": "American Male - Michael",
    # British voices
    "bf_emma": "British Female - Emma",
    "bf_isabella": "British Female - Isabella",
    "bm_george": "British Male - George",
    "bm_lewis": "British Male - Lewis",
    # Additional American voices
    "af_nicole": "American Female - Nicole",
    "af_sky": "American Female - Sky",
}
class TTSManager:
    """Singleton that owns the TTS model and the loaded voicepacks.

    Every ``TTSManager()`` call returns the same object.  The model is built
    once, on first construction; voicepacks are loaded from disk on first
    use and kept in ``self.voicepacks``.
    """

    # The single shared instance, created lazily in __new__.
    _instance = None
    # Guards both instance creation and the one-time setup in __init__.
    _lock = threading.Lock()

    def __new__(cls):
        """Return the shared instance, creating it on the first call."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
            return cls._instance

    def __init__(self):
        """Run the one-time setup; later calls are no-ops.

        Python calls ``__init__`` on every ``TTSManager()`` even though
        ``__new__`` hands back the same object, so the setup is guarded by
        the ``initialized`` flag.  The check and the setup run under the
        class lock so that concurrent first constructions cannot both load
        the model, and so that other threads wait for a fully initialized
        instance.

        Raises:
            Exception: whatever ``_initialize_model`` raises; in that case
                ``initialized`` stays unset and the next construction
                retries the setup.
        """
        with self._lock:
            if hasattr(self, 'initialized'):
                return
            # Limit PyTorch's CPU intra-op parallelism to 4 threads.
            torch.set_num_threads(4)
            self.device = 'cpu'
            self.model = None
            # Cache of loaded voicepacks, keyed by voice name.
            self.voicepacks: Dict[str, torch.Tensor] = {}
            self._initialize_model()
            self.initialized = True

    def _initialize_model(self):
        """Build the model from ``kokoro-v0_19.pth`` and store it on ``self.model``.

        Raises:
            Exception: any error from building or moving the model is
                printed and re-raised unchanged.
        """
        try:
            with torch.no_grad():
                model_dict = build_model('kokoro-v0_19.pth', self.device)
                if isinstance(model_dict, dict):
                    # A dict result is stored as-is (no ``.to`` on a dict).
                    self.model = model_dict
                else:
                    self.model = model_dict.to(self.device)
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

    def _load_voicepack(self, voice_name: str) -> torch.Tensor:
        """Return the voicepack for ``voice_name``, loading it on first use.

        Args:
            voice_name: Voice id; the file ``voices/<voice_name>.pt`` is read
                when the voice is not yet in ``self.voicepacks``.

        Returns:
            The voicepack tensor on ``self.device``.
        """
        if voice_name not in self.voicepacks:
            with torch.no_grad():
                voicepack = torch.load(
                    f'voices/{voice_name}.pt',
                    # Only allow tensor/primitive data to be unpickled.
                    weights_only=True,
                    map_location=self.device
                )
                self.voicepacks[voice_name] = voicepack.to(self.device)
        return self.voicepacks[voice_name]

    # NOTE: lru_cache on a method includes ``self`` in the cache key and keeps
    # it alive; that is harmless here because the class is a singleton.
    @lru_cache(maxsize=100)
    def _generate_speech_cached(self, text: str, voice_name: str) -> Tuple[bytes, str]:
        """Synthesize ``text`` with ``voice_name``, memoized on both arguments.

        Args:
            text: Text to synthesize.
            voice_name: Voice id; its first character is passed to
                ``generate`` as ``lang``.

        Returns:
            ``(audio_bytes, phonemes)`` where ``audio_bytes`` is the raw
            buffer of the audio returned by ``generate``.
        """
        with torch.no_grad():
            voicepack = self._load_voicepack(voice_name)
            audio, phonemes = generate(
                self.model,
                text,
                voicepack,
                lang=voice_name[0]
            )
            # Store immutable bytes in the cache rather than the audio
            # object itself.
            return audio.tobytes(), phonemes

    def generate_speech(self, text: str, voice_name: str = 'af') -> Tuple[np.ndarray, str]:
        """Synthesize ``text`` and return the audio samples and phonemes.

        Args:
            text: Text to synthesize.
            voice_name: Voice id from ``AVAILABLE_VOICES``; unknown ids fall
                back to ``'af'``.

        Returns:
            ``(audio, phonemes)``.  ``audio`` is a read-only float32 view
            over the cached bytes.
        """
        if voice_name not in AVAILABLE_VOICES:
            voice_name = 'af'
        audio_bytes, phonemes = self._generate_speech_cached(text, voice_name)
        # assumes generate() produced float32 samples - TODO confirm
        audio = np.frombuffer(audio_bytes, dtype=np.float32)
        return audio, phonemes
def markdown_to_text(markdown_string):
    """Convert a Markdown string to plain text, dropping code snippets.

    The Markdown is rendered to HTML, every ``<pre>`` block and inline
    ``<code>`` span is replaced by a single space, and the text nodes of the
    remaining HTML are concatenated.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The concatenated text content of the rendered HTML.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets.  DOTALL lets ``.`` cross newlines, which is
    # needed because rendered code blocks contain line breaks; the opening
    # tag may carry attributes and the closing tag optional whitespace.
    html = re.sub(r'<pre\b[^>]*>.*?</pre\s*>', ' ', html, flags=re.DOTALL)
    html = re.sub(r'<code\b[^>]*>.*?</code\s*>', ' ', html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    return ''.join(soup.find_all(string=True))