Spaces:
Sleeping
Sleeping
| from dataclasses import dataclass | |
| import math | |
| import numpy as np | |
| from .config import TTSConfig | |
| from .normalizer import normalize_text, text_to_symbols | |
@dataclass(frozen=True)
class VoiceProfile:
    """Multipliers that colour the synthesized voice.

    Instances are frozen because the presets built from this class are
    shared module-level objects; freezing keeps one caller from altering a
    preset for everyone else.
    """

    # Multiplies the configured base pitch of voiced sounds.
    pitch_scale: float
    # Multiplies the formant / resonance frequencies.
    formant_scale: float
    # Scales the raw-noise share of noise-based (fricative) sounds.
    brightness: float
# Voice presets keyed by name. "neutral" leaves every multiplier at 1.0,
# "bright" raises them and "deep" lowers them.
VOICE_PROFILES = {
    "neutral": VoiceProfile(
        pitch_scale=1.0,
        formant_scale=1.0,
        brightness=1.0,
    ),
    "bright": VoiceProfile(
        pitch_scale=1.2,
        formant_scale=1.1,
        brightness=1.15,
    ),
    "deep": VoiceProfile(
        pitch_scale=0.82,
        formant_scale=0.9,
        brightness=0.85,
    ),
}
# Vowel symbol -> three resonance frequencies in Hz, lowest first. The
# synthesizer adds one sine tone per frequency on top of the pitched source.
# NOTE(review): the upper-case keys presumably stand for multi-letter vowel
# sounds emitted by the text-to-symbol step -- confirm against the normalizer.
VOWELS: dict[str, tuple[int, int, int]] = {
    # lower-case symbols
    "a": (800, 1200, 2500),
    "e": (530, 1850, 2500),
    "i": (300, 2200, 2900),
    "o": (500, 900, 2400),
    "u": (350, 800, 2200),
    # upper-case symbols
    "A": (650, 1600, 2550),
    "I": (320, 2400, 3000),
    "U": (380, 1000, 2300),
    "W": (450, 1100, 2350),
}
# Consonant symbol classes. The synthesizer tests membership in these sets to
# decide which kind of sound to render for a symbol.
FRICATIVES = {"f", "s", "z", "h", "v", "j", "x", "S", "F", "T"}
STOPS = {"p", "b", "t", "d", "k", "g", "c", "q", "C"}
NASALS = {"m", "n"}
LIQUIDS = {"l", "r", "w", "y"}
class TinyTTSSynthesizer:
    """Rule-based synthesizer that renders text symbol by symbol.

    Each symbol produced by ``text_to_symbols`` is turned into a short
    waveform chosen by its class (pause, vowel, fricative, stop, nasal,
    liquid, digit, or a soft-noise fallback). The segments are crossfaded
    together and the result is peak-normalized. All timing and level settings
    are read from ``self.config``.
    """

    # Spoken form of each ASCII digit; a digit symbol is rendered by
    # synthesizing the symbols of its name.
    _DIGIT_NAMES = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
    }

    def __init__(self, config: TTSConfig | None = None):
        """Store ``config``, or a default ``TTSConfig()`` when none is given."""
        self.config = config if config is not None else TTSConfig()

    def synthesize(
        self,
        text: str,
        voice: str = "neutral",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
    ) -> tuple[int, np.ndarray, str]:
        """Render ``text`` to audio.

        Args:
            text: Text to speak.
            voice: Key into ``VOICE_PROFILES``; unknown names fall back to
                ``"neutral"``.
            speed: Speaking-rate divisor applied to every duration. Values
                below 0.1 are clamped to 0.1.
            pitch_shift: Relative pitch offset; voiced pitch is multiplied by
                ``1.0 + pitch_shift``.

        Returns:
            ``(sample_rate, audio, normalized)`` where ``audio`` is a float32
            array whose peak is scaled to ``config.amplitude`` (left untouched
            when it is all zeros) and ``normalized`` is
            ``normalize_text(text)``.
        """
        normalized = normalize_text(text)
        # NOTE(review): symbols come from the raw ``text`` while the caller
        # gets ``normalized`` back. Confirm that ``text_to_symbols``
        # normalizes internally; otherwise the audio may not match the
        # returned text.
        symbols = text_to_symbols(text)
        profile = VOICE_PROFILES.get(voice, VOICE_PROFILES["neutral"])
        speed = max(speed, 0.1)  # guard the duration divisions below

        pieces: list[np.ndarray] = []
        for symbol in symbols:
            segment = self._render_symbol(
                symbol=symbol,
                profile=profile,
                speed=speed,
                pitch_shift=pitch_shift,
            )
            if segment.size:
                pieces.append(segment)
        if not pieces:
            pieces.append(self._silence(0.25))

        audio = self._join(pieces)
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = (audio / peak) * self.config.amplitude
        return self.config.sample_rate, audio.astype(np.float32), normalized

    def _render_symbol(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Return the waveform for one symbol, chosen by the symbol's class."""
        if symbol == " ":
            return self._silence(self.config.pause_duration_ms / 1000 / speed)
        if symbol == "|":
            # "|" is a longer pause: 2.2x the regular pause.
            return self._silence((self.config.pause_duration_ms * 2.2) / 1000 / speed)
        if symbol in VOWELS:
            return self._vowel(symbol, profile, speed, pitch_shift)
        if symbol in FRICATIVES:
            return self._fricative(profile, speed)
        if symbol in STOPS:
            return self._stop(profile, speed)
        if symbol in NASALS:
            return self._nasal(profile, speed, pitch_shift)
        if symbol in LIQUIDS:
            return self._liquid(profile, speed, pitch_shift)
        # Membership test rather than ``str.isdigit()``: isdigit() is also true
        # for non-ASCII digits such as "²" and for multi-character strings,
        # neither of which has a name in the table.
        if symbol in self._DIGIT_NAMES:
            return self._digit(symbol, profile, speed, pitch_shift)
        return self._soft_noise(speed)

    def _vowel(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Render a vowel: three pitch harmonics plus three formant tones."""
        duration = self._duration(1.0, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * profile.pitch_scale * (1.0 + pitch_shift)
        formants = [f * profile.formant_scale for f in VOWELS[symbol]]
        # Pitched source: fundamental plus weaker 2nd and 3rd harmonics.
        source = (
            np.sin(2 * math.pi * pitch * t)
            + 0.35 * np.sin(2 * math.pi * pitch * 2.0 * t)
            + 0.18 * np.sin(2 * math.pi * pitch * 3.0 * t)
        )
        # Formants are added as plain sine tones, not applied as filters.
        resonance = (
            0.42 * np.sin(2 * math.pi * formants[0] * t)
            + 0.22 * np.sin(2 * math.pi * formants[1] * t)
            + 0.12 * np.sin(2 * math.pi * formants[2] * t)
        )
        envelope = self._adsr(len(t), attack=0.08, decay=0.12, sustain=0.82, release=0.18)
        return (0.7 * source + 0.5 * resonance) * envelope

    def _fricative(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Render a fricative as shaped uniform noise (non-deterministic)."""
        duration = self._duration(0.8, speed)
        n = self._num_samples(duration)
        noise = np.random.uniform(-1.0, 1.0, n)
        # First difference of the noise (first sample kept) emphasizes the
        # high-frequency content.
        tilt = np.concatenate(([noise[0]], np.diff(noise)))
        mix = 0.65 * tilt + 0.35 * noise * profile.brightness
        envelope = self._adsr(n, attack=0.02, decay=0.05, sustain=0.6, release=0.2)
        return mix * envelope * 0.7

    def _stop(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Render a stop: a short silence followed by the start of a fricative."""
        closure = self._silence(0.035 / speed)
        burst = self._fricative(profile, speed)[: self._num_samples(0.04 / speed)]
        return np.concatenate([closure, burst])

    def _nasal(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Render a nasal: slightly lowered pitch tone plus two fixed tones."""
        duration = self._duration(0.9, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 0.92 * profile.pitch_scale * (1.0 + pitch_shift)
        signal = (
            np.sin(2 * math.pi * pitch * t)
            + 0.28 * np.sin(2 * math.pi * 280 * profile.formant_scale * t)
            + 0.12 * np.sin(2 * math.pi * 900 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.05, decay=0.08, sustain=0.72, release=0.2)
        return signal * envelope * 0.7

    def _liquid(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Render a liquid: slightly raised, gliding pitch tone plus two fixed tones."""
        duration = self._duration(0.75, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 1.05 * profile.pitch_scale * (1.0 + pitch_shift)
        # Multiplier ramping from 0.95 to 1.05 across the segment, applied
        # inside the sine argument to make the tone glide.
        glide = np.linspace(0.95, 1.05, len(t))
        signal = (
            np.sin(2 * math.pi * pitch * glide * t)
            + 0.22 * np.sin(2 * math.pi * 700 * profile.formant_scale * t)
            + 0.1 * np.sin(2 * math.pi * 1500 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.04, decay=0.08, sustain=0.7, release=0.18)
        return signal * envelope * 0.65

    def _digit(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Render an ASCII digit by synthesizing the symbols of its name.

        Raises:
            KeyError: if ``symbol`` is not one of "0".."9".
        """
        chunks = [
            self._render_symbol(s, profile, speed, pitch_shift)
            for s in text_to_symbols(self._DIGIT_NAMES[symbol])
        ]
        if not chunks:
            return self._silence(0.08)
        return self._join(chunks)

    def _soft_noise(self, speed: float) -> np.ndarray:
        """Render the fallback sound: quiet uniform noise (non-deterministic)."""
        duration = self._duration(0.45, speed)
        n = self._num_samples(duration)
        noise = np.random.uniform(-0.3, 0.3, n)
        envelope = self._adsr(n, attack=0.03, decay=0.1, sustain=0.2, release=0.12)
        return noise * envelope

    def _join(self, pieces: list[np.ndarray]) -> np.ndarray:
        """Crossfade ``pieces`` left to right into one array.

        Produces the same samples as folding ``_crossfade`` over the list, but
        only the last crossfade-window samples are kept "open" between steps,
        so the total copying is linear in the output length instead of
        re-copying the whole accumulated signal for every piece.
        ``pieces`` must be non-empty.
        """
        window = int(self.config.sample_rate * self.config.crossfade_ms / 1000)
        if window <= 0:
            # No overlap configured: crossfading degenerates to concatenation.
            return np.concatenate(pieces)
        finished: list[np.ndarray] = []
        tail = pieces[0]
        for piece in pieces[1:]:
            tail = self._crossfade(tail, piece)
            # Samples more than ``window`` from the end can never be touched
            # by a later crossfade, so they are final.
            if len(tail) > window:
                finished.append(tail[:-window])
                tail = tail[-window:]
        finished.append(tail)
        return np.concatenate(finished)

    def _crossfade(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Join two arrays, linearly blending the end of ``left`` into ``right``.

        The overlap is the configured crossfade length, capped by the length
        of either input; the result is shorter than the plain concatenation by
        that overlap. With no overlap the arrays are simply concatenated.
        """
        fade = min(
            int(self.config.sample_rate * self.config.crossfade_ms / 1000),
            len(left),
            len(right),
        )
        if fade <= 0:
            return np.concatenate([left, right])
        curve_out = np.linspace(1.0, 0.0, fade)
        curve_in = np.linspace(0.0, 1.0, fade)
        mixed = left[-fade:] * curve_out + right[:fade] * curve_in
        return np.concatenate([left[:-fade], mixed, right[fade:]])

    def _duration(self, scale: float, speed: float) -> float:
        """Return a symbol duration in seconds, never shorter than 0.03 s."""
        base = self.config.symbol_duration_ms / 1000
        return max(0.03, (base * scale) / speed)

    def _num_samples(self, duration: float) -> int:
        """Return the sample count for ``duration`` seconds (at least 1)."""
        return max(1, int(self.config.sample_rate * duration))

    def _timeline(self, duration: float) -> np.ndarray:
        """Return evenly spaced time stamps over ``[0, duration)``."""
        return np.linspace(0.0, duration, self._num_samples(duration), endpoint=False)

    def _silence(self, duration: float) -> np.ndarray:
        """Return ``duration`` seconds of float32 zeros."""
        return np.zeros(self._num_samples(duration), dtype=np.float32)

    def _adsr(
        self,
        n: int,
        attack: float,
        decay: float,
        sustain: float,
        release: float,
    ) -> np.ndarray:
        """Return an ``n``-sample attack/decay/sustain/release envelope.

        ``attack``, ``decay`` and ``release`` are fractions of ``n``;
        ``sustain`` is the level held between decay and release. Every stage
        gets at least one sample, so for very small ``n`` the stages add up to
        more than ``n`` and the envelope is truncated from the end.
        """
        attack_n = max(1, int(n * attack))
        decay_n = max(1, int(n * decay))
        release_n = max(1, int(n * release))
        sustain_n = max(1, n - attack_n - decay_n - release_n)
        envelope = np.concatenate(
            [
                np.linspace(0.0, 1.0, attack_n, endpoint=False),
                np.linspace(1.0, sustain, decay_n, endpoint=False),
                np.full(sustain_n, sustain),
                np.linspace(sustain, 0.0, release_n, endpoint=True),
            ]
        )
        # The four stages always total at least ``n`` samples (sustain_n fills
        # any remainder), so only truncation is ever needed, never padding.
        return envelope[:n]