# backend/models/simple_voice_synthesizer.py
import asyncio
import logging
import base64
import io
import numpy as np
from typing import Optional
from backend.config import settings
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
"""
A simple voice synthesizer that creates synthetic speech using basic audio generation.
This is a fallback solution when VibeVoice is not available.
"""
def __init__(self):
self.character_voice_configs = {}
self.initialized = False
async def initialize(self):
"""Initialize simple voice synthesis"""
if not settings.ENABLE_VOICE:
logger.info("Voice synthesis disabled in config")
return False
logger.info("Initializing simple voice synthesizer...")
try:
# Setup character-specific voice parameters
self._setup_character_voices()
self.initialized = True
logger.info("Simple voice synthesizer initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize simple voice synthesizer: {e}")
return False
def _setup_character_voices(self):
"""Setup character-specific voice configurations"""
self.character_voice_configs = {
"moses": {
"base_frequency": 120, # Lower, more authoritative
"speed": 0.9, # Slightly slower
"vibrato_rate": 4.5, # Gentle vibrato
"vibrato_depth": 0.02,
"formant_shift": -0.1, # Deeper formants
},
"samsung_employee": {
"base_frequency": 150, # Professional, clear
"speed": 1.0, # Normal speed
"vibrato_rate": 5.0,
"vibrato_depth": 0.015,
"formant_shift": 0.0, # Neutral formants
},
"jinx": {
"base_frequency": 180, # Higher, more energetic
"speed": 1.15, # Faster speech
"vibrato_rate": 6.0, # More vibrato
"vibrato_depth": 0.03,
"formant_shift": 0.2, # Brighter formants
}
}
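        # A fourth voice could be registered with the same schema; the values
        # below are illustrative only, not part of the shipped configs:
        # self.character_voice_configs["narrator"] = {
        #     "base_frequency": 135, "speed": 0.95, "vibrato_rate": 4.8,
        #     "vibrato_depth": 0.018, "formant_shift": -0.05,
        # }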
async def synthesize(self, text: str, character_id: str) -> Optional[str]:
"""Synthesize speech for given text and character"""
if not self.initialized or not settings.ENABLE_VOICE:
return None
try:
# Get character voice config
voice_config = self.character_voice_configs.get(
character_id,
self.character_voice_configs["samsung_employee"] # Default
)
# Generate audio
audio_data = self._generate_speech(text, voice_config)
# Convert to base64 for web transmission
audio_base64 = self._audio_to_base64(audio_data)
logger.info(f"Generated speech for {character_id}: {len(text)} chars, audio: {len(audio_data)} samples, base64: {len(audio_base64)} chars")
return audio_base64
except Exception as e:
logger.error(f"Error in simple voice synthesis: {e}")
return None
def _generate_speech(self, text: str, voice_config: dict) -> np.ndarray:
"""Generate synthetic speech using formant synthesis"""
# Estimate duration based on text length and speech rate
words = len(text.split())
chars = len(text)
        # Rough estimate (~4 words/s or ~15 chars/s); take the longer of the two
        base_duration = max(words / 4.0, chars / 15.0)
duration = base_duration / voice_config["speed"]
duration = min(duration, 30.0) # Max 30 seconds
sample_rate = settings.SAMPLE_RATE
num_samples = int(duration * sample_rate)
# Generate time array
t = np.linspace(0, duration, num_samples)
# Base frequency with subtle variation
base_freq = voice_config["base_frequency"]
# Add prosody (pitch contours for natural speech)
prosody = self._generate_prosody(t, text, voice_config)
frequency = base_freq * prosody
# Add vibrato
vibrato_rate = voice_config["vibrato_rate"]
vibrato_depth = voice_config["vibrato_depth"]
vibrato = 1 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
frequency *= vibrato
# Generate formants (multiple resonant frequencies)
audio = self._generate_formants(t, frequency, voice_config)
# Add speech-like envelope
envelope = self._generate_envelope(t, text, voice_config)
audio *= envelope
# Normalize
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio)) * 0.7
return audio.astype(np.float32)
def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate pitch contours for natural-sounding speech"""
# Basic prosody pattern
prosody = np.ones_like(t)
sentence_length = len(t)
# Estimate word boundaries based on text length and spaces
word_count = len(text.split())
        # Create word-level pitch variations (one stress cycle per word)
        if word_count > 1:
            word_rate = word_count / (len(t) / settings.SAMPLE_RATE)  # words per second
            word_stress = 1 + 0.15 * np.sin(2 * np.pi * word_rate * t)
prosody *= word_stress
# Add sentence-level intonation based on punctuation
time_norm = np.linspace(0, 1, sentence_length)
if text.endswith('?'):
# Question: rising intonation (more pronounced)
prosody *= (1 + 0.3 * time_norm)
elif text.endswith('!'):
# Exclamation: dramatic rise and fall
prosody *= (1 + 0.4 * np.sin(1.2 * np.pi * time_norm))
else:
# Statement: natural fall with slight initial rise
prosody *= (1 + 0.2 * np.sin(np.pi * time_norm) * np.exp(-1.5 * time_norm))
# Add micro-variations for naturalness
micro_variations = 1 + 0.03 * np.sin(2 * np.pi * 12 * t) # 12 Hz micro-variations
prosody *= micro_variations
        # Character-specific prosody adjustments; the configs expose
        # "base_frequency" rather than a "pitch" key, so derive a pitch
        # factor relative to the neutral 150 Hz default voice.
        character_factor = voice_config.get("base_frequency", 150) / 150.0
        if character_factor >= 1.2:  # High-pitched characters (like Jinx, 180 Hz)
            # Add more dramatic pitch swings
            prosody *= (1 + 0.1 * np.sin(2 * np.pi * 3 * t))
        elif character_factor < 0.9:  # Low-pitched characters (like Moses, 120 Hz)
            # More steady, authoritative prosody
            prosody *= (1 + 0.05 * np.sin(2 * np.pi * 1.5 * t))
return prosody
def _generate_formants(self, t: np.ndarray, frequency: np.ndarray, voice_config: dict) -> np.ndarray:
"""Generate realistic speech using formant synthesis and phoneme patterns"""
        # Integrate frequency into a continuous phase (vectorized cumulative sum)
        phase = 2 * np.pi * np.cumsum(frequency) / settings.SAMPLE_RATE
        phase -= phase[0]  # start at zero phase
# Create voiced/unvoiced pattern based on text characteristics
voiced_pattern = self._create_phoneme_pattern(t)
# Generate rich harmonic content for voiced sounds
voiced_audio = np.zeros_like(t)
for i, is_voiced in enumerate(voiced_pattern):
if is_voiced > 0.5: # Voiced segments
# Create rich harmonic series (like vocal cords)
sample = 0
for harmonic in range(1, 12):
if frequency[i] * harmonic < settings.SAMPLE_RATE / 2: # Avoid aliasing
# Natural harmonic amplitude rolloff
amplitude = 0.6 / (harmonic ** 0.8) * is_voiced
# Add slight randomness to harmonics
phase_noise = 0.1 * np.sin(2 * np.pi * 7 * t[i])
sample += amplitude * np.sin(harmonic * phase[i] + phase_noise)
voiced_audio[i] = sample
# Apply formant filtering for vowel-like quality
formant_shift = voice_config.get("formant_shift", 0.0)
# Dynamic vowel simulation
vowel_rate = 3.0 # Vowel changes per second
vowel_pattern = np.sin(2 * np.pi * vowel_rate * t)
# Multiple vowel formant sets (approximating /a/, /e/, /i/, /o/, /u/)
vowel_formants = {
'a': (730, 1090, 2440), # /a/ as in "father"
'e': (530, 1840, 2480), # /e/ as in "bed"
'i': (270, 2290, 3010), # /i/ as in "beat"
'o': (570, 840, 2410), # /o/ as in "boat"
'u': (440, 1020, 2240) # /u/ as in "boot"
}
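        # (These triples roughly follow the classic Peterson-Barney average
        # formant measurements for the corresponding vowels.)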
# Interpolate between vowels over time
vowel_keys = list(vowel_formants.keys())
vowel_index = ((vowel_pattern + 1) / 2) * (len(vowel_keys) - 1)
        # Apply formant filtering
        rng = np.random.default_rng(0)  # deterministic source for fricative noise
        filtered_audio = np.zeros_like(voiced_audio)
        for i in range(len(t)):
# Get current vowel formants by interpolation
idx = int(vowel_index[i])
frac = vowel_index[i] - idx
if idx < len(vowel_keys) - 1:
f1_a, f2_a, f3_a = vowel_formants[vowel_keys[idx]]
f1_b, f2_b, f3_b = vowel_formants[vowel_keys[idx + 1]]
f1 = f1_a + (f1_b - f1_a) * frac
f2 = f2_a + (f2_b - f2_a) * frac
f3 = f3_a + (f3_b - f3_a) * frac
else:
f1, f2, f3 = vowel_formants[vowel_keys[-1]]
# Apply character-specific formant shift
f1 *= (1 + formant_shift * 0.3)
f2 *= (1 + formant_shift * 0.2)
f3 *= (1 + formant_shift * 0.1)
# Simple formant filtering using resonance approximation
if voiced_pattern[i] > 0.1:
# Emphasize frequencies near formants
sample = voiced_audio[i]
# F1 resonance
f1_resonance = 1 + 0.4 * np.exp(-((frequency[i] - f1) / 80) ** 2)
# F2 resonance
f2_resonance = 1 + 0.3 * np.exp(-((frequency[i] - f2) / 120) ** 2)
# F3 resonance
f3_resonance = 1 + 0.2 * np.exp(-((frequency[i] - f3) / 200) ** 2)
filtered_audio[i] = sample * f1_resonance * f2_resonance * f3_resonance
            else:
                # Unvoiced segments - add fricative noise
                noise_amp = (1 - voiced_pattern[i]) * 0.15
                filtered_audio[i] = (rng.random() - 0.5) * noise_amp
return filtered_audio
def _create_phoneme_pattern(self, t: np.ndarray) -> np.ndarray:
"""Create a pattern of voiced/unvoiced segments to simulate phonemes"""
pattern = np.ones_like(t)
# Create syllable-like rhythm
syllable_rate = 4.5 # Syllables per second
syllable_phase = 2 * np.pi * syllable_rate * t
# Most of syllable is voiced (vowel), with brief unvoiced parts (consonants)
voiced_base = 0.8 + 0.2 * np.sin(syllable_phase)
# Add consonant-like unvoiced segments
consonant_rate = 8.0 # Consonant events per second
consonant_phase = 2 * np.pi * consonant_rate * t
consonant_trigger = np.sin(consonant_phase + np.pi/4)
# Sharp consonant transitions
consonant_mask = (consonant_trigger > 0.85).astype(float)
# Combine patterns - consonants reduce voicing
pattern = voiced_base * (1 - consonant_mask * 0.7)
# Smooth transitions to avoid clicks
kernel_size = max(3, len(pattern) // 200)
if kernel_size % 2 == 0:
kernel_size += 1
if kernel_size >= 3 and kernel_size <= len(pattern) // 3:
kernel = np.ones(kernel_size) / kernel_size
pattern = np.convolve(pattern, kernel, mode='same')
return np.clip(pattern, 0, 1)
def _generate_envelope(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate amplitude envelope for speech-like rhythm"""
envelope = np.ones_like(t)
        # Overall fade in/out (up to 5% of the clip, capped at 500 samples)
        fade_samples = min(int(0.05 * len(t)), 500)
if fade_samples > 0:
# Smooth fade in
envelope[:fade_samples] *= np.sin(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
# Smooth fade out
envelope[-fade_samples:] *= np.cos(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
# Estimate syllables from text length
syllable_count = max(len(text.replace(' ', '')) // 3, 1) # Rough syllable estimate
duration = len(t) / settings.SAMPLE_RATE
syllable_rate = syllable_count / duration
# Create syllable-like amplitude modulation
syllable_pattern = 0.6 + 0.4 * (np.sin(2 * np.pi * syllable_rate * t) ** 2)
envelope *= syllable_pattern
# Add word boundaries (pauses between words)
word_count = len(text.split())
if word_count > 1:
word_rate = word_count / duration
# Create brief pauses between words
word_boundaries = np.sin(2 * np.pi * word_rate * t + np.pi/4)
word_gates = np.where(word_boundaries < -0.8, 0.3, 1.0) # Brief pauses
envelope *= word_gates
# Add breath-like variations
breath_rate = 0.5 # Breathing-like variations
breath_mod = 1 + 0.1 * np.sin(2 * np.pi * breath_rate * t)
envelope *= breath_mod
# Character-specific envelope characteristics
speed = voice_config.get("speed", 1.0)
if speed > 1.1: # Fast talkers (like Jinx)
# More staccato, energetic envelope
energy_bursts = 1 + 0.2 * (np.random.rand(len(t)) > 0.7).astype(float)
envelope *= energy_bursts
elif speed < 0.95: # Slow, deliberate speakers (like Moses)
# Smoother, more sustained envelope
envelope = np.power(envelope, 0.7) # Gentler amplitude changes
# Ensure envelope doesn't go below minimum level
envelope = np.maximum(envelope, 0.1)
return envelope
def _audio_to_base64(self, audio_data: np.ndarray) -> str:
"""Convert audio numpy array to base64 string"""
# Convert to 16-bit PCM
audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
# Create WAV file in memory manually
buffer = io.BytesIO()
# WAV file parameters
sample_rate = settings.SAMPLE_RATE
num_channels = 1 # Mono
bits_per_sample = 16
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(audio_int16) * 2 # 2 bytes per sample
file_size = 36 + data_size
# Write WAV header (44 bytes)
buffer.write(b'RIFF') # Chunk ID (4 bytes)
buffer.write(file_size.to_bytes(4, 'little')) # File size - 8 (4 bytes)
buffer.write(b'WAVE') # Format (4 bytes)
buffer.write(b'fmt ') # Subchunk1 ID (4 bytes)
buffer.write((16).to_bytes(4, 'little')) # Subchunk1 size (4 bytes)
buffer.write((1).to_bytes(2, 'little')) # Audio format (PCM) (2 bytes)
buffer.write(num_channels.to_bytes(2, 'little')) # Num channels (2 bytes)
buffer.write(sample_rate.to_bytes(4, 'little')) # Sample rate (4 bytes)
buffer.write(byte_rate.to_bytes(4, 'little')) # Byte rate (4 bytes)
buffer.write(block_align.to_bytes(2, 'little')) # Block align (2 bytes)
buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
buffer.write(b'data') # Subchunk2 ID (4 bytes)
buffer.write(data_size.to_bytes(4, 'little')) # Subchunk2 size (4 bytes)
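        # (Python's standard-library wave module could emit the same 44-byte
        # header; it is written out by hand here to keep the byte layout explicit.)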
# Write audio data
buffer.write(audio_int16.tobytes())
logger.debug(f"Generated WAV file: {file_size + 8} bytes total, {data_size} bytes audio data")
# Get bytes and encode to base64
buffer.seek(0)
audio_bytes = buffer.read()
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
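        # The data: URL returned below can be assigned directly to an <audio>
        # element's src on the web client (usage note; client code lives elsewhere).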
return f"data:audio/wav;base64,{audio_base64}"
def get_character_voice_info(self, character_id: str) -> dict:
"""Get voice configuration for character"""
return self.character_voice_configs.get(character_id, {})
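# Minimal smoke test: a sketch that assumes backend.config.settings provides
# ENABLE_VOICE=True and an integer SAMPLE_RATE (e.g. 16000); run from the repo
# root so the backend package is importable.
if __name__ == "__main__":
    async def _demo():
        synth = SimpleVoiceSynthesizer()
        if await synth.initialize():
            data_url = await synth.synthesize("Hello there! How are you?", "jinx")
            if data_url:
                print(f"Generated data URL with {len(data_url)} characters")
    asyncio.run(_demo())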