import base64
import io
import logging
from typing import Optional

import numpy as np

from backend.config import settings

logger = logging.getLogger(__name__)


class SimpleVoiceSynthesizer:
    """
    A simple voice synthesizer that creates synthetic speech using basic
    audio generation. This is a fallback solution when VibeVoice is not
    available.
    """

    def __init__(self):
        self.character_voice_configs = {}
        self.initialized = False

    async def initialize(self) -> bool:
        """Initialize simple voice synthesis."""
        if not settings.ENABLE_VOICE:
            logger.info("Voice synthesis disabled in config")
            return False

        logger.info("Initializing simple voice synthesizer...")
        try:
            # Set up character-specific voice parameters
            self._setup_character_voices()
            self.initialized = True
            logger.info("Simple voice synthesizer initialized successfully")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize simple voice synthesizer: {e}")
            return False

    def _setup_character_voices(self):
        """Set up character-specific voice configurations."""
        self.character_voice_configs = {
            "moses": {
                "base_frequency": 120,  # Lower, more authoritative
                "speed": 0.9,           # Slightly slower
                "vibrato_rate": 4.5,    # Gentle vibrato
                "vibrato_depth": 0.02,
                "formant_shift": -0.1,  # Deeper formants
            },
            "samsung_employee": {
                "base_frequency": 150,  # Professional, clear
                "speed": 1.0,           # Normal speed
                "vibrato_rate": 5.0,
                "vibrato_depth": 0.015,
                "formant_shift": 0.0,   # Neutral formants
            },
            "jinx": {
                "base_frequency": 180,  # Higher, more energetic
                "speed": 1.15,          # Faster speech
                "vibrato_rate": 6.0,    # More vibrato
                "vibrato_depth": 0.03,
                "formant_shift": 0.2,   # Brighter formants
            },
        }

    async def synthesize(self, text: str, character_id: str) -> Optional[str]:
        """Synthesize speech for the given text and character."""
        if not self.initialized or not settings.ENABLE_VOICE:
            return None

        try:
            # Get the character's voice config, falling back to the default
            voice_config = self.character_voice_configs.get(
                character_id,
                self.character_voice_configs["samsung_employee"],
            )

            # Generate audio
            audio_data = self._generate_speech(text, voice_config)

            # Convert to base64 for web transmission
            audio_base64 = self._audio_to_base64(audio_data)

            logger.info(
                f"Generated speech for {character_id}: {len(text)} chars, "
                f"audio: {len(audio_data)} samples, base64: {len(audio_base64)} chars"
            )
            return audio_base64
        except Exception as e:
            logger.error(f"Error in simple voice synthesis: {e}")
            return None

    def _generate_speech(self, text: str, voice_config: dict) -> np.ndarray:
        """Generate synthetic speech using formant synthesis."""
        # Estimate duration from text length and speech rate
        words = len(text.split())
        chars = len(text)

        # Rough estimate: 3-5 words per second, adjusted by speed,
        # with a minimum based on character count
        base_duration = max(words / 4.0, chars / 15.0)
        duration = base_duration / voice_config["speed"]
        duration = min(duration, 30.0)  # Cap at 30 seconds

        sample_rate = settings.SAMPLE_RATE
        num_samples = int(duration * sample_rate)

        # Time axis
        t = np.linspace(0, duration, num_samples)

        # Base frequency shaped by prosody (pitch contours for natural speech)
        base_freq = voice_config["base_frequency"]
        prosody = self._generate_prosody(t, text, voice_config)
        frequency = base_freq * prosody

        # Add vibrato
        vibrato_rate = voice_config["vibrato_rate"]
        vibrato_depth = voice_config["vibrato_depth"]
        vibrato = 1 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        frequency *= vibrato

        # Generate formants (multiple resonant frequencies)
        audio = self._generate_formants(t, frequency, voice_config)

        # Apply a speech-like amplitude envelope
        envelope = self._generate_envelope(t, text, voice_config)
        audio *= envelope

        # Normalize to a safe peak level
        if np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio)) * 0.7

        return audio.astype(np.float32)
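    # Signal-generation helpers. _generate_speech composes them in order:
    # prosody (pitch contour) -> vibrato -> formant synthesis (harmonic
    # source plus vowel filtering) -> amplitude envelope -> normalization.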
    def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Generate pitch contours for natural-sounding speech."""
        prosody = np.ones_like(t)
        sentence_length = len(t)

        # Word-level pitch variations, estimated from the word count
        word_count = len(text.split())
        if word_count > 1:
            word_rate = word_count / (len(t) / settings.SAMPLE_RATE)
            word_stress = 1 + 0.15 * np.sin(2 * np.pi * word_rate * t / word_count)
            prosody *= word_stress

        # Sentence-level intonation based on final punctuation
        time_norm = np.linspace(0, 1, sentence_length)
        if text.endswith('?'):
            # Question: pronounced rising intonation
            prosody *= (1 + 0.3 * time_norm)
        elif text.endswith('!'):
            # Exclamation: dramatic rise and fall
            prosody *= (1 + 0.4 * np.sin(1.2 * np.pi * time_norm))
        else:
            # Statement: slight initial rise, then a natural fall
            prosody *= (1 + 0.2 * np.sin(np.pi * time_norm) * np.exp(-1.5 * time_norm))

        # Micro-variations for naturalness (12 Hz jitter)
        prosody *= 1 + 0.03 * np.sin(2 * np.pi * 12 * t)

        # Character-specific prosody adjustments, keyed off base frequency
        base_freq = voice_config.get("base_frequency", 150)
        if base_freq >= 170:
            # High-pitched characters (like Jinx): more dramatic pitch swings
            prosody *= (1 + 0.1 * np.sin(2 * np.pi * 3 * t))
        elif base_freq <= 130:
            # Low-pitched characters (like Moses): steadier, authoritative contour
            prosody *= (1 + 0.05 * np.sin(2 * np.pi * 1.5 * t))

        return prosody

    def _generate_formants(self, t: np.ndarray, frequency: np.ndarray, voice_config: dict) -> np.ndarray:
        """Generate speech-like audio using formant synthesis and phoneme patterns."""
        # Integrate frequency into phase so pitch changes stay continuous
        phase = np.zeros_like(t)
        for i in range(1, len(t)):
            phase[i] = phase[i - 1] + 2 * np.pi * frequency[i] / settings.SAMPLE_RATE

        # Voiced/unvoiced pattern approximating phoneme structure
        voiced_pattern = self._create_phoneme_pattern(t)

        # Rich harmonic content for voiced segments (vocal-cord-like source)
        voiced_audio = np.zeros_like(t)
        for i, is_voiced in enumerate(voiced_pattern):
            if is_voiced > 0.5:  # Voiced segment
                sample = 0.0
                for harmonic in range(1, 12):
                    if frequency[i] * harmonic < settings.SAMPLE_RATE / 2:  # Avoid aliasing
                        # Natural harmonic amplitude rolloff
                        amplitude = 0.6 / (harmonic ** 0.8) * is_voiced
                        # Slight phase jitter on the harmonics
                        phase_noise = 0.1 * np.sin(2 * np.pi * 7 * t[i])
                        sample += amplitude * np.sin(harmonic * phase[i] + phase_noise)
                voiced_audio[i] = sample

        # Formant filtering for vowel-like quality
        formant_shift = voice_config.get("formant_shift", 0.0)

        # Dynamic vowel simulation: sweep through the vowel set over time
        vowel_rate = 3.0  # Vowel changes per second
        vowel_pattern = np.sin(2 * np.pi * vowel_rate * t)

        # Formant sets (F1, F2, F3 in Hz) approximating five vowels
        vowel_formants = {
            'a': (730, 1090, 2440),  # /a/ as in "father"
            'e': (530, 1840, 2480),  # /e/ as in "bed"
            'i': (270, 2290, 3010),  # /i/ as in "beat"
            'o': (570, 840, 2410),   # /o/ as in "boat"
            'u': (440, 1020, 2240),  # /u/ as in "boot"
        }

        # Interpolate between vowels over time
        vowel_keys = list(vowel_formants.keys())
        vowel_index = ((vowel_pattern + 1) / 2) * (len(vowel_keys) - 1)
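        # Each formant is approximated below as a Gaussian resonance boost,
        # 1 + A * exp(-((f - F_n) / bw_n)^2), applied to the harmonic source
        # rather than as a true bandpass filter: crude, but cheap and stable.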
        # Apply formant filtering
        filtered_audio = np.zeros_like(voiced_audio)
        # Precomputed fricative noise (fixed seed keeps output reproducible)
        noise = np.random.default_rng(0).uniform(-0.5, 0.5, len(t))
        for i in range(len(t)):
            # Current vowel formants by linear interpolation
            idx = int(vowel_index[i])
            frac = vowel_index[i] - idx
            if idx < len(vowel_keys) - 1:
                f1_a, f2_a, f3_a = vowel_formants[vowel_keys[idx]]
                f1_b, f2_b, f3_b = vowel_formants[vowel_keys[idx + 1]]
                f1 = f1_a + (f1_b - f1_a) * frac
                f2 = f2_a + (f2_b - f2_a) * frac
                f3 = f3_a + (f3_b - f3_a) * frac
            else:
                f1, f2, f3 = vowel_formants[vowel_keys[-1]]

            # Character-specific formant shift
            f1 *= (1 + formant_shift * 0.3)
            f2 *= (1 + formant_shift * 0.2)
            f3 *= (1 + formant_shift * 0.1)

            if voiced_pattern[i] > 0.1:
                # Emphasize frequencies near the formants
                sample = voiced_audio[i]
                f1_resonance = 1 + 0.4 * np.exp(-((frequency[i] - f1) / 80) ** 2)
                f2_resonance = 1 + 0.3 * np.exp(-((frequency[i] - f2) / 120) ** 2)
                f3_resonance = 1 + 0.2 * np.exp(-((frequency[i] - f3) / 200) ** 2)
                filtered_audio[i] = sample * f1_resonance * f2_resonance * f3_resonance
            else:
                # Unvoiced segment: fricative-like noise
                noise_amp = (1 - voiced_pattern[i]) * 0.15
                filtered_audio[i] = noise[i] * noise_amp

        return filtered_audio

    def _create_phoneme_pattern(self, t: np.ndarray) -> np.ndarray:
        """Create a voiced/unvoiced pattern to simulate phonemes."""
        # Syllable-like rhythm: mostly voiced (vowels), with brief unvoiced parts
        syllable_rate = 4.5  # Syllables per second
        syllable_phase = 2 * np.pi * syllable_rate * t
        voiced_base = 0.8 + 0.2 * np.sin(syllable_phase)

        # Consonant-like unvoiced segments
        consonant_rate = 8.0  # Consonant events per second
        consonant_phase = 2 * np.pi * consonant_rate * t
        consonant_trigger = np.sin(consonant_phase + np.pi / 4)

        # Sharp consonant transitions
        consonant_mask = (consonant_trigger > 0.85).astype(float)

        # Combine patterns: consonants reduce voicing
        pattern = voiced_base * (1 - consonant_mask * 0.7)

        # Smooth transitions to avoid clicks
        kernel_size = max(3, len(pattern) // 200)
        if kernel_size % 2 == 0:
            kernel_size += 1
        if 3 <= kernel_size <= len(pattern) // 3:
            kernel = np.ones(kernel_size) / kernel_size
            pattern = np.convolve(pattern, kernel, mode='same')

        return np.clip(pattern, 0, 1)
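    # The envelope below is a product of independent modulations: edge
    # fades, syllable-rate pulsing, word-boundary gates, slow breath
    # movement, and character-specific shaping, floored at 0.1 so the
    # signal never drops to complete silence mid-utterance.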
    def _generate_envelope(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Generate an amplitude envelope for speech-like rhythm."""
        envelope = np.ones_like(t)

        # Smooth fade in/out at the edges (5% of the signal, up to 500 samples)
        fade_samples = min(int(0.05 * len(t)), 500)
        if fade_samples > 0:
            envelope[:fade_samples] *= np.sin(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
            envelope[-fade_samples:] *= np.cos(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2

        # Rough syllable estimate: one syllable per ~3 non-space characters
        syllable_count = max(len(text.replace(' ', '')) // 3, 1)
        duration = len(t) / settings.SAMPLE_RATE
        syllable_rate = syllable_count / duration

        # Syllable-like amplitude modulation
        syllable_pattern = 0.6 + 0.4 * (np.sin(2 * np.pi * syllable_rate * t) ** 2)
        envelope *= syllable_pattern

        # Brief pauses at word boundaries
        word_count = len(text.split())
        if word_count > 1:
            word_rate = word_count / duration
            word_boundaries = np.sin(2 * np.pi * word_rate * t + np.pi / 4)
            word_gates = np.where(word_boundaries < -0.8, 0.3, 1.0)
            envelope *= word_gates

        # Slow, breath-like variation
        breath_rate = 0.5  # Hz
        envelope *= 1 + 0.1 * np.sin(2 * np.pi * breath_rate * t)

        # Character-specific envelope characteristics
        speed = voice_config.get("speed", 1.0)
        if speed > 1.1:
            # Fast talkers (like Jinx): more staccato, energetic envelope
            energy_bursts = 1 + 0.2 * (np.random.rand(len(t)) > 0.7).astype(float)
            envelope *= energy_bursts
        elif speed < 0.95:
            # Slow, deliberate speakers (like Moses): smoother, more sustained
            envelope = np.power(envelope, 0.7)

        # Keep the envelope above a minimum level
        return np.maximum(envelope, 0.1)

    def _audio_to_base64(self, audio_data: np.ndarray) -> str:
        """Convert an audio numpy array to a base64 data URL."""
        # Convert to 16-bit PCM
        audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)

        # Build the WAV file in memory, writing the header by hand
        buffer = io.BytesIO()

        sample_rate = settings.SAMPLE_RATE
        num_channels = 1  # Mono
        bits_per_sample = 16
        byte_rate = sample_rate * num_channels * bits_per_sample // 8
        block_align = num_channels * bits_per_sample // 8
        data_size = len(audio_int16) * 2  # 2 bytes per sample
        file_size = 36 + data_size       # RIFF chunk size = file size - 8

        # 44-byte WAV header
        buffer.write(b'RIFF')                                # Chunk ID
        buffer.write(file_size.to_bytes(4, 'little'))        # Chunk size
        buffer.write(b'WAVE')                                # Format
        buffer.write(b'fmt ')                                # Subchunk1 ID
        buffer.write((16).to_bytes(4, 'little'))             # Subchunk1 size
        buffer.write((1).to_bytes(2, 'little'))              # Audio format (PCM)
        buffer.write(num_channels.to_bytes(2, 'little'))     # Channels
        buffer.write(sample_rate.to_bytes(4, 'little'))      # Sample rate
        buffer.write(byte_rate.to_bytes(4, 'little'))        # Byte rate
        buffer.write(block_align.to_bytes(2, 'little'))      # Block align
        buffer.write(bits_per_sample.to_bytes(2, 'little'))  # Bits per sample
        buffer.write(b'data')                                # Subchunk2 ID
        buffer.write(data_size.to_bytes(4, 'little'))        # Subchunk2 size

        # Audio payload
        buffer.write(audio_int16.tobytes())

        logger.debug(
            f"Generated WAV file: {file_size + 8} bytes total, {data_size} bytes audio data"
        )

        buffer.seek(0)
        audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')
        return f"data:audio/wav;base64,{audio_base64}"

    def get_character_voice_info(self, character_id: str) -> dict:
        """Get the voice configuration for a character."""
        return self.character_voice_configs.get(character_id, {})
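
# Minimal smoke test (illustrative only). It assumes backend.config.settings
# is importable with ENABLE_VOICE=True and a SAMPLE_RATE such as 22050; the
# character id and output filename are arbitrary.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        synth = SimpleVoiceSynthesizer()
        if not await synth.initialize():
            raise SystemExit("Synthesizer failed to initialize (is ENABLE_VOICE set?)")

        data_url = await synth.synthesize("Hello there! How are you today?", "jinx")
        if data_url is None:
            raise SystemExit("Synthesis returned no audio")

        # The synthesizer returns a data URL; strip the prefix and decode
        # the base64 payload back into WAV bytes.
        wav_bytes = base64.b64decode(data_url.split(",", 1)[1])
        with open("demo_output.wav", "wb") as f:
            f.write(wav_bytes)
        print(f"Wrote demo_output.wav ({len(wav_bytes)} bytes)")

    asyncio.run(_demo())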