Spaces:
Sleeping
Sleeping
File size: 13,742 Bytes
7e68852 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 |
import torch
import asyncio
import logging
import base64
import io
import numpy as np
from typing import Optional
from backend.config import settings
import math
# Module-level logger following the stdlib convention (named after this module).
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
"""
An improved simple voice synthesizer that creates more realistic speech-like audio
using phoneme patterns, formant synthesis, and prosody modeling.
"""
def __init__(self):
self.character_voice_configs = {}
self.initialized = False
async def initialize(self):
"""Initialize simple voice synthesis"""
if not settings.ENABLE_VOICE:
logger.info("Voice synthesis disabled in config")
return False
logger.info("Initializing improved simple voice synthesizer...")
try:
# Setup character-specific voice parameters
self._setup_character_voices()
self.initialized = True
logger.info("Improved simple voice synthesizer initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize simple voice synthesizer: {e}")
return False
def _setup_character_voices(self):
"""Setup character-specific voice configurations"""
self.character_voice_configs = {
"moses": {
"base_frequency": 110, # Lower, more authoritative
"speed": 0.85, # Slower, more measured
"pitch_variance": 0.15, # Less pitch variation
"formant_shift": -0.2, # Deeper formants
"voice_quality": "deep",
},
"samsung_employee": {
"base_frequency": 140, # Professional, clear
"speed": 1.0, # Normal speed
"pitch_variance": 0.2, # Moderate variation
"formant_shift": 0.0, # Neutral formants
"voice_quality": "clear",
},
"jinx": {
"base_frequency": 180, # Higher, more energetic
"speed": 1.2, # Faster speech
"pitch_variance": 0.35, # More pitch variation
"formant_shift": 0.3, # Brighter formants
"voice_quality": "bright",
}
}
async def synthesize(self, text: str, character_id: str) -> Optional[str]:
"""Synthesize speech for given text and character"""
if not self.initialized or not settings.ENABLE_VOICE:
return None
try:
# Get character voice config
voice_config = self.character_voice_configs.get(
character_id,
self.character_voice_configs["samsung_employee"] # Default
)
# Generate realistic speech audio
audio_data = self._generate_realistic_speech(text, voice_config)
# Convert to base64 for web transmission
audio_base64 = self._audio_to_base64(audio_data)
logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
return audio_base64
except Exception as e:
logger.error(f"Error in simple voice synthesis: {e}")
return None
def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
"""Generate realistic speech using advanced phoneme and prosody modeling"""
# Calculate duration based on speaking rate
words = len(text.split())
chars = len(text)
# Realistic speaking rates: 150-180 words per minute
base_wpm = 160
speed_factor = voice_config["speed"]
actual_wpm = base_wpm * speed_factor
# Calculate duration
duration = (words / actual_wpm) * 60 # Convert to seconds
duration = max(duration, chars / 15.0) # Minimum based on character count
duration = min(duration, 30.0) # Maximum 30 seconds
sample_rate = settings.SAMPLE_RATE
num_samples = int(duration * sample_rate)
# Create time array
t = np.linspace(0, duration, num_samples)
# Generate phoneme-based speech patterns
audio = self._create_phoneme_speech(t, text, voice_config)
# Apply prosody (intonation patterns)
prosody = self._generate_prosody(t, text, voice_config)
audio *= prosody
# Apply character-specific voice quality
audio = self._apply_voice_quality(audio, t, voice_config)
# Add natural speech envelope
envelope = self._create_speech_envelope(audio, t)
audio *= envelope
# Normalize and return
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio)) * 0.8
return audio.astype(np.float32)
def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Create speech-like audio using phoneme patterns"""
audio = np.zeros_like(t)
base_freq = voice_config["base_frequency"]
# Create syllable timing based on text
syllable_rate = 4.0 * voice_config["speed"] # syllables per second
syllable_duration = 1.0 / syllable_rate
for i, sample_time in enumerate(t):
# Determine current syllable position
syllable_phase = (sample_time % syllable_duration) / syllable_duration
# Create vowel/consonant pattern
# Vowels: 0.2-0.8 of syllable, Consonants: 0.0-0.2 and 0.8-1.0
is_vowel = 0.2 < syllable_phase < 0.8
# Get fundamental frequency with natural variation
pitch_variation = voice_config["pitch_variance"]
f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * sample_time))
if is_vowel:
# Generate vowel sound using formant synthesis
vowel_sound = self._generate_vowel_formants(sample_time, f0, voice_config)
audio[i] = vowel_sound
else:
# Generate consonant sound using filtered noise
consonant_sound = self._generate_consonant(sample_time, f0, voice_config)
audio[i] = consonant_sound
return audio
def _generate_vowel_formants(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate vowel sounds using formant frequencies"""
formant_shift = voice_config["formant_shift"]
# Vowel formant frequencies (approximate average)
f1 = 650 * (1 + formant_shift * 0.5) # First formant
f2 = 1400 * (1 + formant_shift * 0.3) # Second formant
f3 = 2500 * (1 + formant_shift * 0.2) # Third formant
# Add slight formant movement for naturalness
f1 += 50 * np.sin(2 * np.pi * 1.7 * t)
f2 += 80 * np.sin(2 * np.pi * 2.1 * t)
# Generate harmonic series for fundamental
fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
# Generate formant resonances
formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-abs(f1 - f0*1) / 200)
formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-abs(f2 - f0*2) / 300)
formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-abs(f3 - f0*3) / 500)
# Add harmonics
harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3
def _generate_consonant(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate consonant sounds using filtered noise and fricatives"""
# Create noise component for fricatives
noise = (np.random.randn() - 0.5) * 0.15
# Add some periodic component for voiced consonants
periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
# Filter noise based on consonant type (simplified)
filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
return filtered_noise + periodic * 0.3
def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate natural prosody (intonation) patterns"""
prosody = np.ones_like(t)
duration = t[-1] if len(t) > 0 else 1.0
# Sentence-level intonation
time_norm = t / duration
if text.endswith('?'):
# Question: rising intonation
prosody *= (0.8 + 0.4 * time_norm)
elif text.endswith('!'):
# Exclamation: dramatic contour
prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
else:
# Statement: natural declination
prosody *= (1.0 - 0.2 * time_norm)
# Add micro-prosody for naturalness
prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))
# Character-specific prosody
if voice_config.get("voice_quality") == "bright":
# More animated prosody for energetic characters
prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
elif voice_config.get("voice_quality") == "deep":
# More steady prosody for authoritative characters
prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
return prosody
def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
"""Apply character-specific voice quality effects"""
quality = voice_config.get("voice_quality", "clear")
if quality == "deep":
# Add subtle sub-harmonics for deeper voice
subharmonic = 0.05 * np.sin(np.pi * t)
audio = audio + subharmonic[:len(audio)]
elif quality == "bright":
# Emphasize higher frequencies for brighter voice
high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
audio = audio + high_freq[:len(audio)]
# Add very subtle vocal fry for naturalness
fry_rate = 70 # Hz
fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * (np.random.randn(len(t)) * 0.5 + 0.5)
audio = audio + fry[:len(audio)]
return audio
def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
"""Create natural speech amplitude envelope"""
envelope = np.ones_like(audio)
# Fade in/out
fade_samples = min(int(0.05 * len(audio)), 1000)
if fade_samples > 0:
envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
# Add speech rhythm (breathing, pauses)
breath_rate = 0.3 # Subtle breathing pattern
envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
return envelope
def _audio_to_base64(self, audio_data: np.ndarray) -> str:
"""Convert audio numpy array to base64 string"""
# Convert to 16-bit PCM
audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
# Create WAV file in memory manually
buffer = io.BytesIO()
# WAV file parameters
sample_rate = settings.SAMPLE_RATE
num_channels = 1 # Mono
bits_per_sample = 16
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(audio_int16) * 2 # 2 bytes per sample
file_size = 36 + data_size
# Write WAV header (44 bytes)
buffer.write(b'RIFF') # Chunk ID (4 bytes)
buffer.write(file_size.to_bytes(4, 'little')) # File size - 8 (4 bytes)
buffer.write(b'WAVE') # Format (4 bytes)
buffer.write(b'fmt ') # Subchunk1 ID (4 bytes)
buffer.write((16).to_bytes(4, 'little')) # Subchunk1 size (4 bytes)
buffer.write((1).to_bytes(2, 'little')) # Audio format (PCM) (2 bytes)
buffer.write(num_channels.to_bytes(2, 'little')) # Num channels (2 bytes)
buffer.write(sample_rate.to_bytes(4, 'little')) # Sample rate (4 bytes)
buffer.write(byte_rate.to_bytes(4, 'little')) # Byte rate (4 bytes)
buffer.write(block_align.to_bytes(2, 'little')) # Block align (2 bytes)
buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
buffer.write(b'data') # Subchunk2 ID (4 bytes)
buffer.write(data_size.to_bytes(4, 'little')) # Subchunk2 size (4 bytes)
# Write audio data
buffer.write(audio_int16.tobytes())
# Get bytes and encode to base64
buffer.seek(0)
audio_bytes = buffer.read()
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
return f"data:audio/wav;base64,{audio_base64}"
def get_character_voice_info(self, character_id: str) -> dict:
"""Get voice configuration for character"""
return self.character_voice_configs.get(character_id, {}) |