Spaces:

pvanand
/

Kokoro-tts-api

Sleeping

App Files Files Community

Kokoro-tts-api / app /utils.py

pvanand

Update app/utils.py

afe9b6c verified about 1 year ago

raw

history blame contribute delete

3.39 kB

	# app/utils.py
	import torch
	import threading
	from models import build_model
	from kokoro import generate
	from typing import Tuple, Dict
	import numpy as np
	from functools import lru_cache
	from bs4 import BeautifulSoup
	from markdown import markdown
	import re

	# Voice id -> human-readable label.
	# TTSManager.generate_speech only accepts ids listed here (anything else
	# falls back to "af"); the id is also used to build the voicepack path
	# ``voices/<id>.pt`` and its first character is passed to ``generate`` as
	# ``lang``.
	AVAILABLE_VOICES = {
	    # Default blend
	    "af": "Default (Bella & Sarah mix)",
	    # American voices
	    "af_bella": "American Female - Bella",
	    "af_sarah": "American Female - Sarah",
	    "am_adam": "American Male - Adam",
	    "am_michael": "American Male - Michael",
	    # British voices
	    "bf_emma": "British Female - Emma",
	    "bf_isabella": "British Female - Isabella",
	    "bm_george": "British Male - George",
	    "bm_lewis": "British Male - Lewis",
	    # Further American voices (kept last to preserve iteration order)
	    "af_nicole": "American Female - Nicole",
	    "af_sky": "American Female - Sky",
	}

	class TTSManager:
	    """Process-wide singleton that owns the TTS model and its voicepacks.

	    ``TTSManager()`` always returns the same object. The first call sets
	    torch to 4 threads, loads ``kokoro-v0_19.pth`` on the CPU and prepares
	    an empty voicepack cache; later calls return the already-initialised
	    instance without reloading anything.
	    """

	    _instance = None
	    _lock = threading.Lock()

	    def __new__(cls):
	        # Serialise creation so concurrent first calls share one instance.
	        with cls._lock:
	            if cls._instance is None:
	                cls._instance = super().__new__(cls)
	        return cls._instance

	    def __init__(self):
	        # __init__ runs on every ``TTSManager()`` call, so bail out quickly
	        # once the instance is set up.
	        if hasattr(self, 'initialized'):
	            return
	        # Re-check under the class lock: without it, two threads making the
	        # first call at the same time would both load the model.
	        with self._lock:
	            if hasattr(self, 'initialized'):
	                return
	            torch.set_num_threads(4)
	            self.device = 'cpu'
	            self.model = None
	            self.voicepacks: Dict[str, torch.Tensor] = {}
	            self._initialize_model()
	            # Set last, so a failed load is retried on the next call.
	            self.initialized = True

	    def _initialize_model(self):
	        """Load the model weights into ``self.model``.

	        Raises:
	            Exception: whatever ``build_model`` (or ``.to``) raises; the
	                error is printed and then re-raised unchanged.
	        """
	        try:
	            with torch.no_grad():
	                model_dict = build_model('kokoro-v0_19.pth', self.device)
	                if isinstance(model_dict, dict):
	                    # A dict result is stored as-is.
	                    self.model = model_dict
	                else:
	                    self.model = model_dict.to(self.device)
	        except Exception as e:
	            print(f"Error initializing model: {str(e)}")
	            raise

	    def _load_voicepack(self, voice_name: str) -> torch.Tensor:
	        """Return the voicepack for *voice_name*, loading it on first use.

	        The tensor is read from ``voices/<voice_name>.pt`` with
	        ``weights_only=True`` and kept in ``self.voicepacks`` afterwards.
	        """
	        if voice_name not in self.voicepacks:
	            with torch.no_grad():
	                voicepack = torch.load(
	                    f'voices/{voice_name}.pt',
	                    weights_only=True,
	                    map_location=self.device
	                )
	                self.voicepacks[voice_name] = voicepack.to(self.device)
	        return self.voicepacks[voice_name]

	    @staticmethod
	    def _encode_audio(audio) -> bytes:
	        """Serialise *audio* as raw float32 samples.

	        ``generate_speech`` decodes the cached bytes as float32, so the
	        samples are coerced to that dtype here; an array of any other dtype
	        would otherwise be silently misinterpreted on the way back.
	        """
	        return np.asarray(audio, dtype=np.float32).tobytes()

	    # The cache key includes ``self``; that is harmless here because the
	    # class is a singleton, so there is only ever one instance to retain.
	    @lru_cache(maxsize=100)
	    def _generate_speech_cached(self, text: str, voice_name: str) -> Tuple[bytes, str]:
	        """Synthesise *text* and return ``(float32 sample bytes, phonemes)``.

	        Results are memoised (up to 100 entries) per ``(text, voice_name)``.
	        Audio is stored as immutable bytes so cached entries cannot be
	        modified by callers.
	        """
	        with torch.no_grad():
	            voicepack = self._load_voicepack(voice_name)
	            audio, phonemes = generate(
	                self.model,
	                text,
	                voicepack,
	                # The first character of the voice id is passed as ``lang``.
	                lang=voice_name[0]
	            )
	            return self._encode_audio(audio), phonemes

	    def generate_speech(self, text: str, voice_name: str = 'af') -> Tuple[np.ndarray, str]:
	        """Return ``(audio, phonemes)`` for *text* spoken with *voice_name*.

	        Args:
	            text: Text to synthesise.
	            voice_name: Key of ``AVAILABLE_VOICES``; unknown names fall back
	                to ``'af'`` instead of raising.

	        Returns:
	            A 1-D float32 array decoded from the cached bytes, and the
	            phoneme string reported by ``generate``. The array is a view
	            over immutable bytes, so copy it before modifying it in place.
	        """
	        if voice_name not in AVAILABLE_VOICES:
	            voice_name = 'af'
	        audio_bytes, phonemes = self._generate_speech_cached(text, voice_name)
	        audio = np.frombuffer(audio_bytes, dtype=np.float32)
	        return audio, phonemes

	def markdown_to_text(markdown_string):
	    """Convert a markdown string to plain text, dropping code snippets.

	    The markdown is rendered to HTML, ``<pre>`` and ``<code>`` elements are
	    replaced by a single space, and the remaining text nodes are joined.

	    Args:
	        markdown_string: Markdown source text.

	    Returns:
	        The plain text with markup and code snippets removed.
	    """
	    # md -> html -> text since BeautifulSoup can extract text cleanly
	    html = markdown(markdown_string)

	    # remove code snippets
	    html = _strip_code_html(html)

	    # extract text
	    soup = BeautifulSoup(html, "html.parser")
	    return ''.join(soup.find_all(string=True))


	def _strip_code_html(html):
	    """Replace each ``<pre>``/``<code>`` element in *html* with one space."""
	    # DOTALL lets ``.`` cross newlines, so multi-line blocks are matched;
	    # ``[^>]*`` tolerates attributes on the opening tag (e.g. class="...").
	    html = re.sub(r'<pre\b[^>]*>.*?</pre>', ' ', html, flags=re.DOTALL)
	    return re.sub(r'<code\b[^>]*>.*?</code>', ' ', html, flags=re.DOTALL)