Spaces:
Sleeping
Sleeping
File size: 13,742 Bytes
7e68852 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 |
import torch
import asyncio
import logging
import base64
import io
import numpy as np
from typing import Optional
from backend.config import settings
import math
# Module-level logger following the stdlib convention (named after this module).
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
"""
An improved simple voice synthesizer that creates more realistic speech-like audio
using phoneme patterns, formant synthesis, and prosody modeling.
"""
def __init__(self):
self.character_voice_configs = {}
self.initialized = False
async def initialize(self):
"""Initialize simple voice synthesis"""
if not settings.ENABLE_VOICE:
logger.info("Voice synthesis disabled in config")
return False
logger.info("Initializing improved simple voice synthesizer...")
try:
# Setup character-specific voice parameters
self._setup_character_voices()
self.initialized = True
logger.info("Improved simple voice synthesizer initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize simple voice synthesizer: {e}")
return False
def _setup_character_voices(self):
"""Setup character-specific voice configurations"""
self.character_voice_configs = {
"moses": {
"base_frequency": 110, # Lower, more authoritative
"speed": 0.85, # Slower, more measured
"pitch_variance": 0.15, # Less pitch variation
"formant_shift": -0.2, # Deeper formants
"voice_quality": "deep",
},
"samsung_employee": {
"base_frequency": 140, # Professional, clear
"speed": 1.0, # Normal speed
"pitch_variance": 0.2, # Moderate variation
"formant_shift": 0.0, # Neutral formants
"voice_quality": "clear",
},
"jinx": {
"base_frequency": 180, # Higher, more energetic
"speed": 1.2, # Faster speech
"pitch_variance": 0.35, # More pitch variation
"formant_shift": 0.3, # Brighter formants
"voice_quality": "bright",
}
}
async def synthesize(self, text: str, character_id: str) -> Optional[str]:
"""Synthesize speech for given text and character"""
if not self.initialized or not settings.ENABLE_VOICE:
return None
try:
# Get character voice config
voice_config = self.character_voice_configs.get(
character_id,
self.character_voice_configs["samsung_employee"] # Default
)
# Generate realistic speech audio
audio_data = self._generate_realistic_speech(text, voice_config)
# Convert to base64 for web transmission
audio_base64 = self._audio_to_base64(audio_data)
logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
return audio_base64
except Exception as e:
logger.error(f"Error in simple voice synthesis: {e}")
return None
def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
"""Generate realistic speech using advanced phoneme and prosody modeling"""
# Calculate duration based on speaking rate
words = len(text.split())
chars = len(text)
# Realistic speaking rates: 150-180 words per minute
base_wpm = 160
speed_factor = voice_config["speed"]
actual_wpm = base_wpm * speed_factor
# Calculate duration
duration = (words / actual_wpm) * 60 # Convert to seconds
duration = max(duration, chars / 15.0) # Minimum based on character count
duration = min(duration, 30.0) # Maximum 30 seconds
sample_rate = settings.SAMPLE_RATE
num_samples = int(duration * sample_rate)
# Create time array
t = np.linspace(0, duration, num_samples)
# Generate phoneme-based speech patterns
audio = self._create_phoneme_speech(t, text, voice_config)
# Apply prosody (intonation patterns)
prosody = self._generate_prosody(t, text, voice_config)
audio *= prosody
# Apply character-specific voice quality
audio = self._apply_voice_quality(audio, t, voice_config)
# Add natural speech envelope
envelope = self._create_speech_envelope(audio, t)
audio *= envelope
# Normalize and return
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio)) * 0.8
return audio.astype(np.float32)
def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Create speech-like audio using phoneme patterns"""
audio = np.zeros_like(t)
base_freq = voice_config["base_frequency"]
# Create syllable timing based on text
syllable_rate = 4.0 * voice_config["speed"] # syllables per second
syllable_duration = 1.0 / syllable_rate
for i, sample_time in enumerate(t):
# Determine current syllable position
syllable_phase = (sample_time % syllable_duration) / syllable_duration
# Create vowel/consonant pattern
# Vowels: 0.2-0.8 of syllable, Consonants: 0.0-0.2 and 0.8-1.0
is_vowel = 0.2 < syllable_phase < 0.8
# Get fundamental frequency with natural variation
pitch_variation = voice_config["pitch_variance"]
f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * sample_time))
if is_vowel:
# Generate vowel sound using formant synthesis
vowel_sound = self._generate_vowel_formants(sample_time, f0, voice_config)
audio[i] = vowel_sound
else:
# Generate consonant sound using filtered noise
consonant_sound = self._generate_consonant(sample_time, f0, voice_config)
audio[i] = consonant_sound
return audio
def _generate_vowel_formants(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate vowel sounds using formant frequencies"""
formant_shift = voice_config["formant_shift"]
# Vowel formant frequencies (approximate average)
f1 = 650 * (1 + formant_shift * 0.5) # First formant
f2 = 1400 * (1 + formant_shift * 0.3) # Second formant
f3 = 2500 * (1 + formant_shift * 0.2) # Third formant
# Add slight formant movement for naturalness
f1 += 50 * np.sin(2 * np.pi * 1.7 * t)
f2 += 80 * np.sin(2 * np.pi * 2.1 * t)
# Generate harmonic series for fundamental
fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
# Generate formant resonances
formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-abs(f1 - f0*1) / 200)
formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-abs(f2 - f0*2) / 300)
formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-abs(f3 - f0*3) / 500)
# Add harmonics
harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3
def _generate_consonant(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate consonant sounds using filtered noise and fricatives"""
# Create noise component for fricatives
noise = (np.random.randn() - 0.5) * 0.15
# Add some periodic component for voiced consonants
periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
# Filter noise based on consonant type (simplified)
filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
return filtered_noise + periodic * 0.3
def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate natural prosody (intonation) patterns"""
prosody = np.ones_like(t)
duration = t[-1] if len(t) > 0 else 1.0
# Sentence-level intonation
time_norm = t / duration
if text.endswith('?'):
# Question: rising intonation
prosody *= (0.8 + 0.4 * time_norm)
elif text.endswith('!'):
# Exclamation: dramatic contour
prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
else:
# Statement: natural declination
prosody *= (1.0 - 0.2 * time_norm)
# Add micro-prosody for naturalness
prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))
# Character-specific prosody
if voice_config.get("voice_quality") == "bright":
# More animated prosody for energetic characters
prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
elif voice_config.get("voice_quality") == "deep":
# More steady prosody for authoritative characters
prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
return prosody
def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
"""Apply character-specific voice quality effects"""
quality = voice_config.get("voice_quality", "clear")
if quality == "deep":
# Add subtle sub-harmonics for deeper voice
subharmonic = 0.05 * np.sin(np.pi * t)
audio = audio + subharmonic[:len(audio)]
elif quality == "bright":
# Emphasize higher frequencies for brighter voice
high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
audio = audio + high_freq[:len(audio)]
# Add very subtle vocal fry for naturalness
fry_rate = 70 # Hz
fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * (np.random.randn(len(t)) * 0.5 + 0.5)
audio = audio + fry[:len(audio)]
return audio
def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
"""Create natural speech amplitude envelope"""
envelope = np.ones_like(audio)
# Fade in/out
fade_samples = min(int(0.05 * len(audio)), 1000)
if fade_samples > 0:
envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
# Add speech rhythm (breathing, pauses)
breath_rate = 0.3 # Subtle breathing pattern
envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
return envelope
def _audio_to_base64(self, audio_data: np.ndarray) -> str:
"""Convert audio numpy array to base64 string"""
# Convert to 16-bit PCM
audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
# Create WAV file in memory manually
buffer = io.BytesIO()
# WAV file parameters
sample_rate = settings.SAMPLE_RATE
num_channels = 1 # Mono
bits_per_sample = 16
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(audio_int16) * 2 # 2 bytes per sample
file_size = 36 + data_size
# Write WAV header (44 bytes)
buffer.write(b'RIFF') # Chunk ID (4 bytes)
buffer.write(file_size.to_bytes(4, 'little')) # File size - 8 (4 bytes)
buffer.write(b'WAVE') # Format (4 bytes)
buffer.write(b'fmt ') # Subchunk1 ID (4 bytes)
buffer.write((16).to_bytes(4, 'little')) # Subchunk1 size (4 bytes)
buffer.write((1).to_bytes(2, 'little')) # Audio format (PCM) (2 bytes)
buffer.write(num_channels.to_bytes(2, 'little')) # Num channels (2 bytes)
buffer.write(sample_rate.to_bytes(4, 'little')) # Sample rate (4 bytes)
buffer.write(byte_rate.to_bytes(4, 'little')) # Byte rate (4 bytes)
buffer.write(block_align.to_bytes(2, 'little')) # Block align (2 bytes)
buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
buffer.write(b'data') # Subchunk2 ID (4 bytes)
buffer.write(data_size.to_bytes(4, 'little')) # Subchunk2 size (4 bytes)
# Write audio data
buffer.write(audio_int16.tobytes())
# Get bytes and encode to base64
buffer.seek(0)
audio_bytes = buffer.read()
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
return f"data:audio/wav;base64,{audio_base64}"
def get_character_voice_info(self, character_id: str) -> dict:
"""Get voice configuration for character"""
return self.character_voice_configs.get(character_id, {}) |