Spaces:

abersbail
/

tiny-code-only-tts

Sleeping

App Files Files Community

tiny-code-only-tts / mini_tts /synth.py

abersbail

Deploy tiny code-only TTS Space

0a88ee7 verified 5 days ago

raw

history blame contribute delete

9 kB

	from dataclasses import dataclass
	import math

	import numpy as np

	from .config import TTSConfig
	from .normalizer import normalize_text, text_to_symbols


	@dataclass(frozen=True)
	class VoiceProfile:
	    """Immutable set of multipliers that colour a synthesised voice."""

	    # Multiplies the configured base pitch of voiced sounds.
	    pitch_scale: float
	    # Multiplies the formant / resonance frequencies of voiced sounds.
	    formant_scale: float
	    # Weights the unfiltered noise component of fricative sounds.
	    brightness: float


	# Built-in voices, looked up by name; "neutral" leaves every multiplier at 1.0.
	VOICE_PROFILES = dict(
	    neutral=VoiceProfile(pitch_scale=1.0, formant_scale=1.0, brightness=1.0),
	    bright=VoiceProfile(pitch_scale=1.2, formant_scale=1.1, brightness=1.15),
	    deep=VoiceProfile(pitch_scale=0.82, formant_scale=0.9, brightness=0.85),
	)


	# Vowel symbol -> three formant frequencies in Hz, lowest first. Each value is
	# multiplied by the voice's formant_scale before being synthesised.
	# NOTE(review): the upper-case keys are presumably special vowel symbols
	# emitted by the normalizer - confirm against text_to_symbols.
	VOWELS = dict(
	    a=(800, 1200, 2500),
	    e=(530, 1850, 2500),
	    i=(300, 2200, 2900),
	    o=(500, 900, 2400),
	    u=(350, 800, 2200),
	    A=(650, 1600, 2550),
	    I=(320, 2400, 3000),
	    U=(380, 1000, 2300),
	    W=(450, 1100, 2350),
	)

	# Consonant classes; each class is rendered by its own generator.
	FRICATIVES = {"f", "s", "z", "h", "v", "j", "x", "S", "F", "T"}
	STOPS = {"p", "b", "t", "d", "k", "g", "c", "q", "C"}
	NASALS = {"m", "n"}
	LIQUIDS = {"l", "r", "w", "y"}


	class TinyTTSSynthesizer:
	    """Rule-based text-to-speech synthesizer that builds audio with NumPy only.

	    Each symbol returned by ``text_to_symbols`` is rendered by a small
	    generator (vowel, fricative, stop, nasal, liquid, digit, pause or soft
	    noise). The segments are joined with linear crossfades and the result is
	    peak-normalised.

	    Timing, pitch and level come from ``self.config``; the attributes read are
	    ``sample_rate``, ``amplitude``, ``base_pitch_hz``, ``symbol_duration_ms``,
	    ``pause_duration_ms`` and ``crossfade_ms``.
	    """

	    # Digits are spoken by rendering the symbols of their English names.
	    _DIGIT_NAMES = {
	        "0": "zero",
	        "1": "one",
	        "2": "two",
	        "3": "three",
	        "4": "four",
	        "5": "five",
	        "6": "six",
	        "7": "seven",
	        "8": "eight",
	        "9": "nine",
	    }

	    def __init__(self, config: TTSConfig | None = None):
	        """Store *config*, or a default ``TTSConfig()`` when none is given."""
	        self.config = config or TTSConfig()

	    def synthesize(
	        self,
	        text: str,
	        voice: str = "neutral",
	        speed: float = 1.0,
	        pitch_shift: float = 0.0,
	    ) -> tuple[int, np.ndarray, str]:
	        """Render *text* to a mono waveform.

	        Args:
	            text: Text to speak.
	            voice: Key into ``VOICE_PROFILES``; unknown names use ``"neutral"``.
	            speed: Rate multiplier; values below 0.1 are clamped to 0.1.
	            pitch_shift: Relative pitch offset; voiced pitch is multiplied by
	                ``1.0 + pitch_shift``.

	        Returns:
	            ``(sample_rate, audio, normalized_text)``. ``audio`` is float32 and
	            is scaled so its peak magnitude is ``config.amplitude`` unless it
	            is entirely silent.
	        """
	        normalized = normalize_text(text)
	        # NOTE(review): symbols come from the raw text, not from `normalized`;
	        # presumably text_to_symbols normalises internally - confirm.
	        symbols = text_to_symbols(text)
	        profile = VOICE_PROFILES.get(voice, VOICE_PROFILES["neutral"])
	        # Clamp once so a zero or negative speed cannot reach the divisions below.
	        safe_speed = max(speed, 0.1)

	        pieces: list[np.ndarray] = []
	        for symbol in symbols:
	            segment = self._render_symbol(
	                symbol=symbol,
	                profile=profile,
	                speed=safe_speed,
	                pitch_shift=pitch_shift,
	            )
	            if segment.size:
	                pieces.append(segment)

	        if not pieces:
	            # Nothing renderable: return a short silence instead of an empty array.
	            pieces.append(self._silence(0.25))

	        audio = self._join_segments(pieces)

	        peak = np.max(np.abs(audio))
	        if peak > 0:
	            audio = (audio / peak) * self.config.amplitude

	        return self.config.sample_rate, audio.astype(np.float32), normalized

	    def _join_segments(self, pieces: list[np.ndarray]) -> np.ndarray:
	        """Concatenate a non-empty list of segments, crossfading every seam."""
	        audio = pieces[0]
	        for piece in pieces[1:]:
	            audio = self._crossfade(audio, piece)
	        return audio

	    def _render_symbol(
	        self,
	        symbol: str,
	        profile: VoiceProfile,
	        speed: float,
	        pitch_shift: float,
	    ) -> np.ndarray:
	        """Dispatch one symbol to the generator for its class.

	        ``" "`` yields a pause of ``pause_duration_ms`` and ``"|"`` a pause 2.2
	        times as long, both divided by *speed*. Symbols that belong to no known
	        class are rendered as soft noise.
	        """
	        if symbol == " ":
	            return self._silence(self.config.pause_duration_ms / 1000 / speed)
	        if symbol == "|":
	            return self._silence((self.config.pause_duration_ms * 2.2) / 1000 / speed)
	        if symbol in VOWELS:
	            return self._vowel(symbol, profile, speed, pitch_shift)
	        if symbol in FRICATIVES:
	            return self._fricative(profile, speed)
	        if symbol in STOPS:
	            return self._stop(profile, speed)
	        if symbol in NASALS:
	            return self._nasal(profile, speed, pitch_shift)
	        if symbol in LIQUIDS:
	            return self._liquid(profile, speed, pitch_shift)
	        # Membership test rather than str.isdigit(): isdigit() is also true for
	        # characters such as superscript or Arabic-Indic digits, which have no
	        # spelled-out name here.
	        if symbol in self._DIGIT_NAMES:
	            return self._digit(symbol, profile, speed, pitch_shift)
	        return self._soft_noise(speed)

	    def _vowel(
	        self,
	        symbol: str,
	        profile: VoiceProfile,
	        speed: float,
	        pitch_shift: float,
	    ) -> np.ndarray:
	        """Render a vowel as a harmonic source plus sinusoids at its formants."""
	        duration = self._duration(1.0, speed)
	        t = self._timeline(duration)
	        pitch = self.config.base_pitch_hz * profile.pitch_scale * (1.0 + pitch_shift)
	        formants = [f * profile.formant_scale for f in VOWELS[symbol]]
	        # Fundamental plus its second and third harmonics.
	        source = (
	            np.sin(2 * math.pi * pitch * t)
	            + 0.35 * np.sin(2 * math.pi * pitch * 2.0 * t)
	            + 0.18 * np.sin(2 * math.pi * pitch * 3.0 * t)
	        )
	        # One sinusoid per formant, with decreasing weight.
	        resonance = (
	            0.42 * np.sin(2 * math.pi * formants[0] * t)
	            + 0.22 * np.sin(2 * math.pi * formants[1] * t)
	            + 0.12 * np.sin(2 * math.pi * formants[2] * t)
	        )
	        envelope = self._adsr(len(t), attack=0.08, decay=0.12, sustain=0.82, release=0.18)
	        return (0.7 * source + 0.5 * resonance) * envelope

	    def _fricative(self, profile: VoiceProfile, speed: float) -> np.ndarray:
	        """Render a fricative as enveloped uniform noise (not deterministic)."""
	        duration = self._duration(0.8, speed)
	        n = self._num_samples(duration)
	        noise = np.random.uniform(-1.0, 1.0, n)
	        # First difference of the noise, kept the same length as `noise`.
	        tilt = np.concatenate(([noise[0]], np.diff(noise)))
	        mix = 0.65 * tilt + 0.35 * noise * profile.brightness
	        envelope = self._adsr(n, attack=0.02, decay=0.05, sustain=0.6, release=0.2)
	        return mix * envelope * 0.7

	    def _stop(self, profile: VoiceProfile, speed: float) -> np.ndarray:
	        """Render a stop as a silent closure followed by a short noise burst."""
	        closure = self._silence(0.035 / speed)
	        # The burst is the opening slice of a fricative segment.
	        burst = self._fricative(profile, speed)[: self._num_samples(0.04 / speed)]
	        return np.concatenate([closure, burst])

	    def _nasal(
	        self,
	        profile: VoiceProfile,
	        speed: float,
	        pitch_shift: float,
	    ) -> np.ndarray:
	        """Render a nasal: a slightly lowered pitch plus two fixed resonances."""
	        duration = self._duration(0.9, speed)
	        t = self._timeline(duration)
	        pitch = self.config.base_pitch_hz * 0.92 * profile.pitch_scale * (1.0 + pitch_shift)
	        signal = (
	            np.sin(2 * math.pi * pitch * t)
	            + 0.28 * np.sin(2 * math.pi * 280 * profile.formant_scale * t)
	            + 0.12 * np.sin(2 * math.pi * 900 * profile.formant_scale * t)
	        )
	        envelope = self._adsr(len(t), attack=0.05, decay=0.08, sustain=0.72, release=0.2)
	        return signal * envelope * 0.7

	    def _liquid(
	        self,
	        profile: VoiceProfile,
	        speed: float,
	        pitch_shift: float,
	    ) -> np.ndarray:
	        """Render a liquid: a tone gliding from 0.95x to 1.05x of its pitch,
	        plus two fixed resonances."""
	        duration = self._duration(0.75, speed)
	        t = self._timeline(duration)
	        pitch = self.config.base_pitch_hz * 1.05 * profile.pitch_scale * (1.0 + pitch_shift)
	        freq = pitch * np.linspace(0.95, 1.05, len(t))
	        # Phase must be the running integral of the instantaneous frequency.
	        # Evaluating sin(2*pi*f(t)*t) directly adds a t*f'(t) term, which would
	        # make the sweep end near 1.15x instead of the intended 1.05x.
	        dt = duration / len(t)
	        phase = 2 * math.pi * dt * (np.cumsum(freq) - freq)
	        signal = (
	            np.sin(phase)
	            + 0.22 * np.sin(2 * math.pi * 700 * profile.formant_scale * t)
	            + 0.1 * np.sin(2 * math.pi * 1500 * profile.formant_scale * t)
	        )
	        envelope = self._adsr(len(t), attack=0.04, decay=0.08, sustain=0.7, release=0.18)
	        return signal * envelope * 0.65

	    def _digit(
	        self,
	        symbol: str,
	        profile: VoiceProfile,
	        speed: float,
	        pitch_shift: float,
	    ) -> np.ndarray:
	        """Render a digit by synthesising the symbols of its English name.

	        Symbols without an entry in ``_DIGIT_NAMES`` are rendered as soft noise
	        instead of raising ``KeyError``.
	        """
	        spoken = self._DIGIT_NAMES.get(symbol)
	        if spoken is None:
	            return self._soft_noise(speed)
	        chunks = [
	            self._render_symbol(s, profile, speed, pitch_shift)
	            for s in text_to_symbols(spoken)
	        ]
	        if not chunks:
	            return self._silence(0.08)
	        return self._join_segments(chunks)

	    def _soft_noise(self, speed: float) -> np.ndarray:
	        """Render a quiet noise puff, used for symbols with no dedicated generator."""
	        duration = self._duration(0.45, speed)
	        n = self._num_samples(duration)
	        noise = np.random.uniform(-0.3, 0.3, n)
	        envelope = self._adsr(n, attack=0.03, decay=0.1, sustain=0.2, release=0.12)
	        return noise * envelope

	    def _crossfade(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
	        """Join *left* and *right* with a linear crossfade over their overlap.

	        The fade length is ``crossfade_ms`` converted to samples, capped at the
	        length of either input. With no room to fade, the inputs are simply
	        concatenated.
	        """
	        fade = min(
	            int(self.config.sample_rate * self.config.crossfade_ms / 1000),
	            len(left),
	            len(right),
	        )
	        if fade <= 0:
	            return np.concatenate([left, right])

	        curve_out = np.linspace(1.0, 0.0, fade)
	        curve_in = np.linspace(0.0, 1.0, fade)
	        mixed = left[-fade:] * curve_out + right[:fade] * curve_in
	        return np.concatenate([left[:-fade], mixed, right[fade:]])

	    def _duration(self, scale: float, speed: float) -> float:
	        """Return a symbol duration in seconds, never shorter than 0.03 s."""
	        base = self.config.symbol_duration_ms / 1000
	        return max(0.03, (base * scale) / speed)

	    def _num_samples(self, duration: float) -> int:
	        """Convert *duration* in seconds to a sample count of at least one."""
	        return max(1, int(self.config.sample_rate * duration))

	    def _timeline(self, duration: float) -> np.ndarray:
	        """Return sample times in seconds covering ``[0, duration)``."""
	        return np.linspace(0.0, duration, self._num_samples(duration), endpoint=False)

	    def _silence(self, duration: float) -> np.ndarray:
	        """Return *duration* seconds of float32 zeros (at least one sample)."""
	        return np.zeros(self._num_samples(duration), dtype=np.float32)

	    def _adsr(
	        self,
	        n: int,
	        attack: float,
	        decay: float,
	        sustain: float,
	        release: float,
	    ) -> np.ndarray:
	        """Build an attack/decay/sustain/release envelope of length *n*.

	        *attack*, *decay* and *release* are fractions of *n*; *sustain* is the
	        level held between decay and release. Every stage is at least one
	        sample long, so for very small *n* the stages overflow and the tail of
	        the envelope is cut off by the final slice.
	        """
	        attack_n = max(1, int(n * attack))
	        decay_n = max(1, int(n * decay))
	        release_n = max(1, int(n * release))
	        sustain_n = max(1, n - attack_n - decay_n - release_n)

	        # The four stage lengths always sum to at least n, so only truncation
	        # is ever needed; padding cannot occur.
	        stages = (
	            np.linspace(0.0, 1.0, attack_n, endpoint=False),
	            np.linspace(1.0, sustain, decay_n, endpoint=False),
	            np.full(sustain_n, sustain),
	            np.linspace(sustain, 0.0, release_n, endpoint=True),
	        )
	        return np.concatenate(stages)[:n]