Spaces:
Sleeping
Sleeping
| import torch | |
| import asyncio | |
| import logging | |
| import base64 | |
| import io | |
| import numpy as np | |
| from typing import Optional | |
| from backend.config import settings | |
| import math | |
# Module-level logger, namespaced to this module's import path (PEP 282 convention).
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
    """
    An improved simple voice synthesizer that creates more realistic speech-like
    audio using phoneme patterns, formant synthesis, and prosody modeling.

    Pipeline (see ``synthesize``): phoneme-pattern carrier -> prosody contour ->
    character voice-quality effects -> amplitude envelope -> WAV/base64 data URI.
    Output is synthetic speech-*like* audio, not intelligible TTS.
    """

    def __init__(self):
        # character_id -> synthesis parameter dict; populated by initialize().
        self.character_voice_configs = {}
        # Set True only after initialize() succeeds; synthesize() is a no-op until then.
        self.initialized = False

    async def initialize(self):
        """Initialize simple voice synthesis.

        Returns:
            bool: True on success, False if voice is disabled in config or
            setup raised.
        """
        if not settings.ENABLE_VOICE:
            logger.info("Voice synthesis disabled in config")
            return False
        logger.info("Initializing improved simple voice synthesizer...")
        try:
            # Setup character-specific voice parameters
            self._setup_character_voices()
            self.initialized = True
            logger.info("Improved simple voice synthesizer initialized successfully")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize simple voice synthesizer: {e}")
            return False

    def _setup_character_voices(self):
        """Setup character-specific voice configurations.

        Each config carries: base_frequency (Hz), speed (rate multiplier),
        pitch_variance (relative F0 wobble), formant_shift (relative formant
        scaling) and voice_quality (selects effects in _apply_voice_quality).
        """
        self.character_voice_configs = {
            "moses": {
                "base_frequency": 110,   # Lower, more authoritative
                "speed": 0.85,           # Slower, more measured
                "pitch_variance": 0.15,  # Less pitch variation
                "formant_shift": -0.2,   # Deeper formants
                "voice_quality": "deep",
            },
            "samsung_employee": {
                "base_frequency": 140,   # Professional, clear
                "speed": 1.0,            # Normal speed
                "pitch_variance": 0.2,   # Moderate variation
                "formant_shift": 0.0,    # Neutral formants
                "voice_quality": "clear",
            },
            "jinx": {
                "base_frequency": 180,   # Higher, more energetic
                "speed": 1.2,            # Faster speech
                "pitch_variance": 0.35,  # More pitch variation
                "formant_shift": 0.3,    # Brighter formants
                "voice_quality": "bright",
            }
        }

    async def synthesize(self, text: str, character_id: str) -> Optional[str]:
        """Synthesize speech for given text and character.

        Args:
            text: Text to "speak" (drives duration and prosody only).
            character_id: Key into character_voice_configs; unknown ids fall
                back to the "samsung_employee" voice.

        Returns:
            A ``data:audio/wav;base64,...`` URI string, or None when the
            synthesizer is uninitialized/disabled or synthesis fails.
        """
        if not self.initialized or not settings.ENABLE_VOICE:
            return None
        try:
            # Get character voice config
            voice_config = self.character_voice_configs.get(
                character_id,
                self.character_voice_configs["samsung_employee"]  # Default
            )
            # Generate realistic speech audio
            audio_data = self._generate_realistic_speech(text, voice_config)
            # Convert to base64 for web transmission
            audio_base64 = self._audio_to_base64(audio_data)
            logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
            return audio_base64
        except Exception as e:
            logger.error(f"Error in simple voice synthesis: {e}")
            return None

    def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
        """Generate realistic speech using phoneme and prosody modeling.

        Returns a float32 mono waveform in [-0.8, 0.8] at settings.SAMPLE_RATE.
        """
        # Duration from word count at a realistic speaking rate (150-180 wpm),
        # clamped below by character count and above by a 30 s cap.
        words = len(text.split())
        chars = len(text)
        base_wpm = 160
        speed_factor = voice_config["speed"]
        actual_wpm = base_wpm * speed_factor
        duration = (words / actual_wpm) * 60  # Convert to seconds
        duration = max(duration, chars / 15.0)  # Minimum based on character count
        duration = min(duration, 30.0)  # Maximum 30 seconds

        sample_rate = settings.SAMPLE_RATE
        num_samples = int(duration * sample_rate)
        t = np.linspace(0, duration, num_samples)

        # Carrier: phoneme-like vowel/consonant alternation.
        audio = self._create_phoneme_speech(t, text, voice_config)
        # Intonation contour (question rise, statement declination, ...).
        prosody = self._generate_prosody(t, text, voice_config)
        audio *= prosody
        # Character-specific coloration (sub-harmonics, brightness, vocal fry).
        audio = self._apply_voice_quality(audio, t, voice_config)
        # Fade-in/out plus a subtle breathing rhythm.
        envelope = self._create_speech_envelope(audio, t)
        audio *= envelope

        # Normalize to 0.8 peak; guard empty text (0 samples) and silence,
        # where np.max would raise / divide by zero.
        if audio.size > 0 and np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio)) * 0.8
        return audio.astype(np.float32)

    def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Create speech-like audio using phoneme patterns.

        Fully vectorized: the original per-sample Python loop cost ~720k
        iterations for 30 s of audio; this computes the same vowel/consonant
        alternation with NumPy masks in a handful of array ops.
        """
        base_freq = voice_config["base_frequency"]
        # Syllable timing scaled by the character's speaking speed.
        syllable_rate = 4.0 * voice_config["speed"]  # syllables per second
        syllable_duration = 1.0 / syllable_rate

        # Position within the current syllable, normalized to [0, 1).
        syllable_phase = (t % syllable_duration) / syllable_duration
        # Vowels occupy 0.2-0.8 of each syllable (strict bounds, as before);
        # the remainder is treated as consonant.
        is_vowel = (syllable_phase > 0.2) & (syllable_phase < 0.8)

        # Fundamental frequency with slow natural vibrato.
        pitch_variation = voice_config["pitch_variance"]
        f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * t))

        # Generate both layers over the whole timeline, then select per sample.
        vowel_layer = self._generate_vowel_formants(t, f0, voice_config)
        consonant_layer = self._generate_consonant(t, f0, voice_config)
        return np.where(is_vowel, vowel_layer, consonant_layer)

    def _generate_vowel_formants(self, t: np.ndarray, f0: np.ndarray, voice_config: dict) -> np.ndarray:
        """Generate vowel sounds using formant frequencies (element-wise over t)."""
        formant_shift = voice_config["formant_shift"]
        # Vowel formant frequencies (approximate average), shifted per character.
        f1 = 650 * (1 + formant_shift * 0.5)   # First formant
        f2 = 1400 * (1 + formant_shift * 0.3)  # Second formant
        f3 = 2500 * (1 + formant_shift * 0.2)  # Third formant
        # Slight formant movement for naturalness.
        f1 = f1 + 50 * np.sin(2 * np.pi * 1.7 * t)
        f2 = f2 + 80 * np.sin(2 * np.pi * 2.1 * t)
        # Harmonic series for the fundamental.
        fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
        # Formant resonances, attenuated by distance from the nearest harmonic.
        formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-np.abs(f1 - f0 * 1) / 200)
        formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-np.abs(f2 - f0 * 2) / 300)
        formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-np.abs(f3 - f0 * 3) / 500)
        # Second and third harmonics.
        harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
        harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
        return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3

    def _generate_consonant(self, t: np.ndarray, f0: np.ndarray, voice_config: dict) -> np.ndarray:
        """Generate consonant sounds using filtered noise and fricatives.

        BUGFIX: the original computed ``np.random.randn() - 0.5`` — randn is
        already zero-mean, so the -0.5 added a DC bias to every consonant
        sample. Unbiased Gaussian noise is used instead.
        """
        # Zero-mean noise component for fricatives.
        noise = np.random.randn(np.size(t)) * 0.15
        # Periodic component for voiced consonants.
        periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
        # Simplified high-frequency shaping of the noise.
        filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
        return filtered_noise + periodic * 0.3

    def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Generate natural prosody (intonation) patterns as a gain curve."""
        prosody = np.ones_like(t)
        # Guard duration > 0: a single-sample t ([0.0]) would otherwise divide
        # by zero and fill the contour with NaNs.
        duration = t[-1] if len(t) > 0 and t[-1] > 0 else 1.0
        time_norm = t / duration

        # Sentence-level intonation keyed off the final punctuation.
        if text.endswith('?'):
            # Question: rising intonation
            prosody *= (0.8 + 0.4 * time_norm)
        elif text.endswith('!'):
            # Exclamation: dramatic contour
            prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
        else:
            # Statement: natural declination
            prosody *= (1.0 - 0.2 * time_norm)

        # Micro-prosody wobble for naturalness.
        prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))

        # Character-specific prosody.
        if voice_config.get("voice_quality") == "bright":
            # More animated prosody for energetic characters
            prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
        elif voice_config.get("voice_quality") == "deep":
            # More steady prosody for authoritative characters
            prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
        return prosody

    def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
        """Apply character-specific voice quality effects."""
        quality = voice_config.get("voice_quality", "clear")
        if quality == "deep":
            # Subtle low-frequency component for a deeper-sounding voice.
            subharmonic = 0.05 * np.sin(np.pi * t)
            audio = audio + subharmonic[:len(audio)]
        elif quality == "bright":
            # Emphasize higher frequencies for a brighter voice.
            high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
            audio = audio + high_freq[:len(audio)]
        # Very subtle vocal fry (amplitude-modulated ~70 Hz) for naturalness.
        fry_rate = 70  # Hz
        fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * (np.random.randn(len(t)) * 0.5 + 0.5)
        audio = audio + fry[:len(audio)]
        return audio

    def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
        """Create natural speech amplitude envelope (fades + breathing)."""
        envelope = np.ones_like(audio)
        # Linear fade in/out over at most 5% of the clip (capped at 1000 samples).
        fade_samples = min(int(0.05 * len(audio)), 1000)
        if fade_samples > 0:
            envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
            envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
        # Subtle breathing rhythm.
        breath_rate = 0.3
        envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
        return envelope

    def _audio_to_base64(self, audio_data: np.ndarray) -> str:
        """Convert audio numpy array to a base64 ``data:audio/wav`` URI.

        Uses the stdlib ``wave`` module instead of a hand-written 44-byte RIFF
        header; the emitted bytes for mono 16-bit PCM are identical and the
        header fields can no longer drift out of sync with the data.
        """
        import wave  # local import: only needed at encode time

        # Convert to 16-bit PCM with clipping.
        audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
        buffer = io.BytesIO()
        with wave.open(buffer, 'wb') as wav:
            wav.setnchannels(1)                       # mono
            wav.setsampwidth(2)                       # 16-bit samples
            wav.setframerate(settings.SAMPLE_RATE)
            wav.writeframes(audio_int16.tobytes())
        audio_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        return f"data:audio/wav;base64,{audio_base64}"

    def get_character_voice_info(self, character_id: str) -> dict:
        """Get voice configuration for character (empty dict if unknown)."""
        return self.character_voice_configs.get(character_id, {})