Spaces:
Sleeping
Sleeping
| # app/utils.py | |
| import torch | |
| import threading | |
| from models import build_model | |
| from kokoro import generate | |
| from typing import Tuple, Dict | |
| import numpy as np | |
| from functools import lru_cache | |
| from bs4 import BeautifulSoup | |
| from markdown import markdown | |
| import re | |
# Voice identifier -> human-readable label.
# The identifier is also what TTSManager uses to locate the voicepack file
# (voices/<identifier>.pt), and its first character is passed to the
# generator as the `lang` argument. Insertion order is preserved.
AVAILABLE_VOICES = dict(
    af="Default (Bella & Sarah mix)",
    af_bella="American Female - Bella",
    af_sarah="American Female - Sarah",
    am_adam="American Male - Adam",
    am_michael="American Male - Michael",
    bf_emma="British Female - Emma",
    bf_isabella="British Female - Isabella",
    bm_george="British Male - George",
    bm_lewis="British Male - Lewis",
    af_nicole="American Female - Nicole",
    af_sky="American Female - Sky",
)
class TTSManager:
    """Singleton that owns the TTS model and the voicepacks loaded so far.

    Every ``TTSManager()`` call returns the same instance. The model is built
    once, on first construction; voicepacks are loaded from disk on first use
    and kept in ``self.voicepacks`` afterwards.
    """

    _instance = None
    # Guards both creation of the singleton and its one-time initialization.
    _lock = threading.Lock()

    def __new__(cls):
        """Return the shared instance, creating it on the first call."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
            return cls._instance

    def __init__(self):
        """Build the model the first time the singleton is constructed.

        Python calls ``__init__`` on every ``TTSManager()`` call, so the
        ``initialized`` attribute is used to make later calls a no-op.

        Raises:
            Exception: whatever ``_initialize_model`` raises. In that case
                ``initialized`` is not set, so the next construction retries.
        """
        # The check and the setup run under the class lock. Without it, a
        # second thread constructing the manager while the first is still
        # loading would see no `initialized` attribute, reset `self.model`
        # and `self.voicepacks`, and load the model a second time.
        # __new__ has already released the lock by the time __init__ runs,
        # so re-acquiring the (non-reentrant) lock here cannot deadlock.
        with self._lock:
            if hasattr(self, 'initialized'):
                return
            torch.set_num_threads(4)
            self.device = 'cpu'
            self.model = None
            self.voicepacks: Dict[str, torch.Tensor] = {}
            self._initialize_model()
            self.initialized = True

    def _initialize_model(self):
        """Build the model from ``kokoro-v0_19.pth`` and store it on ``self.model``.

        ``build_model`` may return either a dict or an object with ``.to()``;
        a dict is stored as-is, anything else is moved to ``self.device``.

        Raises:
            Exception: any error from building the model is printed and
                re-raised unchanged.
        """
        try:
            with torch.no_grad():
                model_dict = build_model('kokoro-v0_19.pth', self.device)
                if isinstance(model_dict, dict):
                    self.model = model_dict
                else:
                    self.model = model_dict.to(self.device)
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

    def _load_voicepack(self, voice_name: str) -> torch.Tensor:
        """Return the voicepack for ``voice_name``, loading it on first use.

        Args:
            voice_name: stem of the file under ``voices/`` (``voices/<name>.pt``).

        Returns:
            The voicepack stored in ``self.voicepacks`` for that name.
        """
        if voice_name not in self.voicepacks:
            with torch.no_grad():
                voicepack = torch.load(
                    f'voices/{voice_name}.pt',
                    weights_only=True,
                    map_location=self.device
                )
                self.voicepacks[voice_name] = voicepack.to(self.device)
        return self.voicepacks[voice_name]

    def _generate_speech_cached(self, text: str, voice_name: str) -> Tuple[bytes, str]:
        """Run the generator and return the audio as raw bytes plus phonemes.

        NOTE(review): despite the name, no caching is applied to this method
        in the code as it stands; every call runs ``generate`` again.

        Args:
            text: text to synthesize.
            voice_name: voicepack to use; its first character is passed to
                ``generate`` as ``lang``.

        Returns:
            ``(audio.tobytes(), phonemes)`` from the generator's output.
        """
        with torch.no_grad():
            voicepack = self._load_voicepack(voice_name)
            audio, phonemes = generate(
                self.model,
                text,
                voicepack,
                lang=voice_name[0]
            )
            return audio.tobytes(), phonemes

    def generate_speech(self, text: str, voice_name: str = 'af') -> Tuple[np.ndarray, str]:
        """Synthesize ``text`` with the given voice.

        Args:
            text: text to synthesize.
            voice_name: key of ``AVAILABLE_VOICES``; any other value silently
                falls back to ``'af'``.

        Returns:
            ``(audio, phonemes)`` where ``audio`` is a float32 array rebuilt
            from the generated bytes.
        """
        if voice_name not in AVAILABLE_VOICES:
            voice_name = 'af'
        audio_bytes, phonemes = self._generate_speech_cached(text, voice_name)
        # Assumes the generator emits float32 samples - TODO confirm.
        # np.frombuffer over a bytes object yields a read-only array.
        audio = np.frombuffer(audio_bytes, dtype=np.float32)
        return audio, phonemes
def markdown_to_text(markdown_string):
    """Convert a Markdown string to plain text, dropping code snippets.

    The Markdown is rendered to HTML, ``<pre>...</pre>`` and
    ``<code>...</code>`` elements are each replaced by a single space, and
    the remaining text nodes are concatenated.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plain-text content with code blocks and inline code removed.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets. DOTALL is required because rendered code blocks
    # span several lines and '.' does not match a newline by default.
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # The closing tag must be exactly '</code>'; the earlier pattern had a
    # stray space ('</code >') and therefore never matched inline code.
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of findAll(text=True).
    return ''.join(soup.find_all(string=True))