# Roleplay-Chat-Box: backend/models/enhanced_voice_synthesizer.py
import asyncio
import base64
import io
import logging
from typing import Optional

import numpy as np

from backend.config import settings
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
"""
An improved simple voice synthesizer that creates more realistic speech-like audio
using phoneme patterns, formant synthesis, and prosody modeling.
"""
def __init__(self):
self.character_voice_configs = {}
self.initialized = False
async def initialize(self):
"""Initialize simple voice synthesis"""
if not settings.ENABLE_VOICE:
logger.info("Voice synthesis disabled in config")
return False
logger.info("Initializing improved simple voice synthesizer...")
try:
# Setup character-specific voice parameters
self._setup_character_voices()
self.initialized = True
logger.info("Improved simple voice synthesizer initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize simple voice synthesizer: {e}")
return False
def _setup_character_voices(self):
"""Setup character-specific voice configurations"""
self.character_voice_configs = {
"moses": {
"base_frequency": 110, # Lower, more authoritative
"speed": 0.85, # Slower, more measured
"pitch_variance": 0.15, # Less pitch variation
"formant_shift": -0.2, # Deeper formants
"voice_quality": "deep",
},
"samsung_employee": {
"base_frequency": 140, # Professional, clear
"speed": 1.0, # Normal speed
"pitch_variance": 0.2, # Moderate variation
"formant_shift": 0.0, # Neutral formants
"voice_quality": "clear",
},
"jinx": {
"base_frequency": 180, # Higher, more energetic
"speed": 1.2, # Faster speech
"pitch_variance": 0.35, # More pitch variation
"formant_shift": 0.3, # Brighter formants
"voice_quality": "bright",
}
}
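        # A new character can be registered with one more entry; e.g. a
        # hypothetical "narrator" voice (values illustrative, not tuned):
        #   self.character_voice_configs["narrator"] = {
        #       "base_frequency": 125, "speed": 0.95, "pitch_variance": 0.18,
        #       "formant_shift": -0.1, "voice_quality": "deep",
        #   }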
async def synthesize(self, text: str, character_id: str) -> Optional[str]:
"""Synthesize speech for given text and character"""
if not self.initialized or not settings.ENABLE_VOICE:
return None
try:
# Get character voice config
voice_config = self.character_voice_configs.get(
character_id,
self.character_voice_configs["samsung_employee"] # Default
)
# Generate realistic speech audio
audio_data = self._generate_realistic_speech(text, voice_config)
# Convert to base64 for web transmission
audio_base64 = self._audio_to_base64(audio_data)
logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
return audio_base64
except Exception as e:
logger.error(f"Error in simple voice synthesis: {e}")
return None
def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
"""Generate realistic speech using advanced phoneme and prosody modeling"""
# Calculate duration based on speaking rate
words = len(text.split())
chars = len(text)
# Realistic speaking rates: 150-180 words per minute
base_wpm = 160
speed_factor = voice_config["speed"]
actual_wpm = base_wpm * speed_factor
# Calculate duration
duration = (words / actual_wpm) * 60 # Convert to seconds
duration = max(duration, chars / 15.0) # Minimum based on character count
duration = min(duration, 30.0) # Maximum 30 seconds
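        # Worked example: 24 words at speed 0.85 -> 160 * 0.85 = 136 wpm,
        # so (24 / 136) * 60 ≈ 10.6 s; both clamps above leave that unchanged.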
sample_rate = settings.SAMPLE_RATE
num_samples = int(duration * sample_rate)
# Create time array
t = np.linspace(0, duration, num_samples)
# Generate phoneme-based speech patterns
audio = self._create_phoneme_speech(t, text, voice_config)
# Apply prosody (intonation patterns)
prosody = self._generate_prosody(t, text, voice_config)
audio *= prosody
# Apply character-specific voice quality
audio = self._apply_voice_quality(audio, t, voice_config)
# Add natural speech envelope
envelope = self._create_speech_envelope(audio, t)
audio *= envelope
# Normalize and return
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio)) * 0.8
return audio.astype(np.float32)
def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Create speech-like audio using phoneme patterns"""
audio = np.zeros_like(t)
base_freq = voice_config["base_frequency"]
# Create syllable timing based on text
syllable_rate = 4.0 * voice_config["speed"] # syllables per second
syllable_duration = 1.0 / syllable_rate
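        # Example timing: at speed 1.0 each syllable cycle lasts 0.25 s, with
        # the middle 60% treated as the vowel and the edges as consonants.
        # (This per-sample Python loop is simple but slow; it could be
        # vectorized with NumPy boolean masks for longer utterances.)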
for i, sample_time in enumerate(t):
# Determine current syllable position
syllable_phase = (sample_time % syllable_duration) / syllable_duration
# Create vowel/consonant pattern
# Vowels: 0.2-0.8 of syllable, Consonants: 0.0-0.2 and 0.8-1.0
is_vowel = 0.2 < syllable_phase < 0.8
# Get fundamental frequency with natural variation
pitch_variation = voice_config["pitch_variance"]
f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * sample_time))
if is_vowel:
# Generate vowel sound using formant synthesis
vowel_sound = self._generate_vowel_formants(sample_time, f0, voice_config)
audio[i] = vowel_sound
else:
# Generate consonant sound using filtered noise
consonant_sound = self._generate_consonant(sample_time, f0, voice_config)
audio[i] = consonant_sound
return audio
def _generate_vowel_formants(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate vowel sounds using formant frequencies"""
formant_shift = voice_config["formant_shift"]
# Vowel formant frequencies (approximate average)
f1 = 650 * (1 + formant_shift * 0.5) # First formant
f2 = 1400 * (1 + formant_shift * 0.3) # Second formant
f3 = 2500 * (1 + formant_shift * 0.2) # Third formant
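        # These defaults roughly approximate an open central vowel; real
        # vowels differ mainly in F1/F2 (e.g. /i/ near 300/2300 Hz, /a/ near
        # 700/1200 Hz).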
# Add slight formant movement for naturalness
f1 += 50 * np.sin(2 * np.pi * 1.7 * t)
f2 += 80 * np.sin(2 * np.pi * 2.1 * t)
# Generate harmonic series for fundamental
fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
# Generate formant resonances
formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-abs(f1 - f0*1) / 200)
formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-abs(f2 - f0*2) / 300)
formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-abs(f3 - f0*3) / 500)
# Add harmonics
harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3
def _generate_consonant(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate consonant sounds using filtered noise and fricatives"""
        # Zero-mean Gaussian noise for the fricative component
        # (randn() is already centered, so no offset is needed)
        noise = np.random.randn() * 0.15
# Add some periodic component for voiced consonants
periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
        # Crude spectral shaping: modulate the noise at ~3 kHz as a stand-in
        # for a fricative band-pass filter
        filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
return filtered_noise + periodic * 0.3
def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate natural prosody (intonation) patterns"""
prosody = np.ones_like(t)
duration = t[-1] if len(t) > 0 else 1.0
# Sentence-level intonation
time_norm = t / duration
if text.endswith('?'):
# Question: rising intonation
prosody *= (0.8 + 0.4 * time_norm)
elif text.endswith('!'):
# Exclamation: dramatic contour
prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
else:
# Statement: natural declination
prosody *= (1.0 - 0.2 * time_norm)
# Add micro-prosody for naturalness
prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))
# Character-specific prosody
if voice_config.get("voice_quality") == "bright":
# More animated prosody for energetic characters
prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
elif voice_config.get("voice_quality") == "deep":
# More steady prosody for authoritative characters
prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
return prosody
def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
"""Apply character-specific voice quality effects"""
quality = voice_config.get("voice_quality", "clear")
if quality == "deep":
# Add subtle sub-harmonics for deeper voice
subharmonic = 0.05 * np.sin(np.pi * t)
audio = audio + subharmonic[:len(audio)]
elif quality == "bright":
# Emphasize higher frequencies for brighter voice
high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
audio = audio + high_freq[:len(audio)]
# Add very subtle vocal fry for naturalness
        fry_rate = 70  # Hz, within the typical vocal-fry range
        fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * np.random.rand(len(t))
audio = audio + fry[:len(audio)]
return audio
def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
"""Create natural speech amplitude envelope"""
envelope = np.ones_like(audio)
# Fade in/out
fade_samples = min(int(0.05 * len(audio)), 1000)
if fade_samples > 0:
envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
# Add speech rhythm (breathing, pauses)
breath_rate = 0.3 # Subtle breathing pattern
envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
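        # At 0.3 Hz this sways loudness by +/-5% over a ~3.3 s cycle, a crude
        # stand-in for breath-group variation.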
return envelope
def _audio_to_base64(self, audio_data: np.ndarray) -> str:
"""Convert audio numpy array to base64 string"""
# Convert to 16-bit PCM
audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
# Create WAV file in memory manually
buffer = io.BytesIO()
# WAV file parameters
sample_rate = settings.SAMPLE_RATE
num_channels = 1 # Mono
bits_per_sample = 16
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(audio_int16) * 2 # 2 bytes per sample
file_size = 36 + data_size
# Write WAV header (44 bytes)
buffer.write(b'RIFF') # Chunk ID (4 bytes)
buffer.write(file_size.to_bytes(4, 'little')) # File size - 8 (4 bytes)
buffer.write(b'WAVE') # Format (4 bytes)
buffer.write(b'fmt ') # Subchunk1 ID (4 bytes)
buffer.write((16).to_bytes(4, 'little')) # Subchunk1 size (4 bytes)
buffer.write((1).to_bytes(2, 'little')) # Audio format (PCM) (2 bytes)
buffer.write(num_channels.to_bytes(2, 'little')) # Num channels (2 bytes)
buffer.write(sample_rate.to_bytes(4, 'little')) # Sample rate (4 bytes)
buffer.write(byte_rate.to_bytes(4, 'little')) # Byte rate (4 bytes)
buffer.write(block_align.to_bytes(2, 'little')) # Block align (2 bytes)
buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
buffer.write(b'data') # Subchunk2 ID (4 bytes)
buffer.write(data_size.to_bytes(4, 'little')) # Subchunk2 size (4 bytes)
# Write audio data
buffer.write(audio_int16.tobytes())
# Get bytes and encode to base64
buffer.seek(0)
audio_bytes = buffer.read()
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
return f"data:audio/wav;base64,{audio_base64}"
def get_character_voice_info(self, character_id: str) -> dict:
"""Get voice configuration for character"""
return self.character_voice_configs.get(character_id, {})
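
# Minimal local smoke test: a sketch that assumes `settings.ENABLE_VOICE` is
# true and `settings.SAMPLE_RATE` is set (e.g. 22050) in backend.config.
if __name__ == "__main__":
    async def _demo():
        synth = SimpleVoiceSynthesizer()
        if await synth.initialize():
            data_uri = await synth.synthesize("Hello there!", "jinx")
            if data_uri:
                print(f"Generated data URI with {len(data_uri)} characters")

    asyncio.run(_demo())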