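"""Simple fallback voice synthesizer.

Generates synthetic speech with basic formant synthesis when the
VibeVoice backend is not available.
"""
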
import asyncio
import logging
import base64
import io
import numpy as np
from typing import Optional
from backend.config import settings

logger = logging.getLogger(__name__)

class SimpleVoiceSynthesizer:
    """
    A simple voice synthesizer that creates synthetic speech using basic audio generation.
    This is a fallback solution when VibeVoice is not available.
    """
    
    def __init__(self):
        self.character_voice_configs = {}
        self.initialized = False
        
    async def initialize(self):
        """Initialize simple voice synthesis"""
        if not settings.ENABLE_VOICE:
            logger.info("Voice synthesis disabled in config")
            return False
            
        logger.info("Initializing simple voice synthesizer...")
        
        try:
            # Setup character-specific voice parameters
            self._setup_character_voices()
            self.initialized = True
            logger.info("Simple voice synthesizer initialized successfully")
            return True
            
        except Exception as e:
            logger.error(f"Failed to initialize simple voice synthesizer: {e}")
            return False
            
    def _setup_character_voices(self):
        """Setup character-specific voice configurations"""
        self.character_voice_configs = {
            "moses": {
                "base_frequency": 120,  # Lower, more authoritative
                "speed": 0.9,  # Slightly slower
                "vibrato_rate": 4.5,  # Gentle vibrato
                "vibrato_depth": 0.02,
                "formant_shift": -0.1,  # Deeper formants
            },
            "samsung_employee": {
                "base_frequency": 150,  # Professional, clear
                "speed": 1.0,  # Normal speed
                "vibrato_rate": 5.0,
                "vibrato_depth": 0.015,
                "formant_shift": 0.0,  # Neutral formants
            },
            "jinx": {
                "base_frequency": 180,  # Higher, more energetic
                "speed": 1.15,  # Faster speech
                "vibrato_rate": 6.0,  # More vibrato
                "vibrato_depth": 0.03,
                "formant_shift": 0.2,  # Brighter formants
            }
        }
        
    async def synthesize(self, text: str, character_id: str) -> Optional[str]:
        """Synthesize speech for given text and character"""
        if not self.initialized or not settings.ENABLE_VOICE:
            return None
            
        try:
            # Get character voice config
            voice_config = self.character_voice_configs.get(
                character_id, 
                self.character_voice_configs["samsung_employee"]  # Default
            )
            
            # Generate audio
            audio_data = self._generate_speech(text, voice_config)
            
            # Convert to base64 for web transmission
            audio_base64 = self._audio_to_base64(audio_data)
            
            logger.info(f"Generated speech for {character_id}: {len(text)} chars, audio: {len(audio_data)} samples, base64: {len(audio_base64)} chars")
            return audio_base64
            
        except Exception as e:
            logger.error(f"Error in simple voice synthesis: {e}")
            return None
            
    def _generate_speech(self, text: str, voice_config: dict) -> np.ndarray:
        """Generate synthetic speech using formant synthesis"""
        
        # Estimate duration based on text length and speech rate
        words = len(text.split())
        chars = len(text)
        
        # Rough estimate: ~4 words per second, with a character-based floor
        base_duration = max(words / 4.0, chars / 15.0)
        duration = base_duration / voice_config["speed"]
        duration = min(duration, 30.0)  # Cap at 30 seconds
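        # Example: a 20-word, 100-character line gives
        # max(20 / 4.0, 100 / 15.0) ≈ 6.7 s, stretched to ~7.4 s at speed 0.9.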
        
        sample_rate = settings.SAMPLE_RATE
        num_samples = int(duration * sample_rate)
        
        # Generate time array
        t = np.linspace(0, duration, num_samples)
        
        # Base frequency with subtle variation
        base_freq = voice_config["base_frequency"]
        
        # Add prosody (pitch contours for natural speech)
        prosody = self._generate_prosody(t, text, voice_config)
        frequency = base_freq * prosody
        
        # Add vibrato
        vibrato_rate = voice_config["vibrato_rate"]
        vibrato_depth = voice_config["vibrato_depth"]
        vibrato = 1 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
        frequency *= vibrato
        
        # Generate formants (multiple resonant frequencies)
        audio = self._generate_formants(t, frequency, voice_config)
        
        # Add speech-like envelope
        envelope = self._generate_envelope(t, text, voice_config)
        audio *= envelope
        
        # Peak-normalize to 0.7 to leave headroom and avoid clipping
        if np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio)) * 0.7
            
        return audio.astype(np.float32)
        
    def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Generate pitch contours for natural-sounding speech"""
        
        # Basic prosody pattern
        prosody = np.ones_like(t)
        sentence_length = len(t)
        
        # Estimate word boundaries from the text
        word_count = len(text.split())
        
        # Create word-level pitch variations: one gentle stress cycle per word
        if word_count > 1:
            word_rate = word_count / (len(t) / settings.SAMPLE_RATE)  # words/second
            word_stress = 1 + 0.15 * np.sin(2 * np.pi * word_rate * t)
            prosody *= word_stress
        
        # Add sentence-level intonation based on punctuation
        time_norm = np.linspace(0, 1, sentence_length)
        
        if text.endswith('?'):
            # Question: rising intonation (more pronounced)
            prosody *= (1 + 0.3 * time_norm)
            
        elif text.endswith('!'):
            # Exclamation: dramatic rise and fall
            prosody *= (1 + 0.4 * np.sin(1.2 * np.pi * time_norm))
            
        else:
            # Statement: natural fall with slight initial rise
            prosody *= (1 + 0.2 * np.sin(np.pi * time_norm) * np.exp(-1.5 * time_norm))
        
        # Add micro-variations for naturalness
        micro_variations = 1 + 0.03 * np.sin(2 * np.pi * 12 * t)  # 12 Hz micro-variations
        prosody *= micro_variations
        
        # Character-specific prosody adjustments, keyed off base pitch
        # relative to the neutral default of 150 Hz (the voice configs
        # carry no separate "pitch" key)
        character_factor = voice_config.get("base_frequency", 150) / 150.0
        if character_factor >= 1.2:  # High-pitched characters (like Jinx)
            # Add more dramatic pitch swings
            prosody *= (1 + 0.1 * np.sin(2 * np.pi * 3 * t))
        elif character_factor < 0.9:  # Low-pitched characters (like Moses)
            # More steady, authoritative prosody
            prosody *= (1 + 0.05 * np.sin(2 * np.pi * 1.5 * t))
        
        return prosody
        
    def _generate_formants(self, t: np.ndarray, frequency: np.ndarray, voice_config: dict) -> np.ndarray:
        """Generate realistic speech using formant synthesis and phoneme patterns"""
        
        # Integrate instantaneous frequency into phase:
        # phase[n] = phase[n-1] + 2*pi*f[n]/fs, vectorized with cumsum
        phase = np.zeros_like(t)
        phase[1:] = np.cumsum(2 * np.pi * frequency[1:] / settings.SAMPLE_RATE)
            
        # Create voiced/unvoiced pattern based on text characteristics
        voiced_pattern = self._create_phoneme_pattern(t)
        
        # Generate rich harmonic content for voiced sounds (vectorized over
        # samples; the harmonic series mimics vocal-cord excitation)
        voiced_audio = np.zeros_like(t)
        voiced_mask = voiced_pattern > 0.5  # Voiced segments only
        phase_noise = 0.1 * np.sin(2 * np.pi * 7 * t)  # Slight harmonic jitter
        for harmonic in range(1, 12):
            # Skip samples where this harmonic would alias
            alias_free = (frequency * harmonic) < (settings.SAMPLE_RATE / 2)
            # Natural harmonic amplitude rolloff, scaled by voicing strength
            amplitude = 0.6 / (harmonic ** 0.8) * voiced_pattern
            voiced_audio += np.where(
                voiced_mask & alias_free,
                amplitude * np.sin(harmonic * phase + phase_noise),
                0.0,
            )
        
        # Apply formant filtering for vowel-like quality
        formant_shift = voice_config.get("formant_shift", 0.0)
        
        # Dynamic vowel simulation
        vowel_rate = 3.0  # Vowel changes per second
        vowel_pattern = np.sin(2 * np.pi * vowel_rate * t)
        
        # Multiple vowel formant sets (approximating /a/, /e/, /i/, /o/, /u/)
        vowel_formants = {
            'a': (730, 1090, 2440),   # /a/ as in "father"
            'e': (530, 1840, 2480),   # /e/ as in "bed" 
            'i': (270, 2290, 3010),   # /i/ as in "beat"
            'o': (570, 840, 2410),    # /o/ as in "boat"
            'u': (440, 1020, 2240)    # /u/ as in "boot"
        }
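        # (These triples appear to be drawn from the classic Peterson & Barney
        # male formant averages for the nearest measured vowels.)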
        
        # Interpolate between vowels over time
        vowel_keys = list(vowel_formants.keys())
        vowel_index = ((vowel_pattern + 1) / 2) * (len(vowel_keys) - 1)
        
        # Apply formant filtering
        filtered_audio = np.zeros_like(voiced_audio)
        
        # Pre-generate deterministic fricative noise for the unvoiced
        # segments (avoids reseeding the global RNG inside the loop)
        rng = np.random.default_rng(0)
        noise = rng.random(len(t)) - 0.5
        
        for i in range(len(t)):
            # Get current vowel formants by interpolation
            idx = int(vowel_index[i])
            frac = vowel_index[i] - idx
            
            if idx < len(vowel_keys) - 1:
                f1_a, f2_a, f3_a = vowel_formants[vowel_keys[idx]]
                f1_b, f2_b, f3_b = vowel_formants[vowel_keys[idx + 1]]
                
                f1 = f1_a + (f1_b - f1_a) * frac
                f2 = f2_a + (f2_b - f2_a) * frac  
                f3 = f3_a + (f3_b - f3_a) * frac
            else:
                f1, f2, f3 = vowel_formants[vowel_keys[-1]]
            
            # Apply character-specific formant shift
            f1 *= (1 + formant_shift * 0.3)
            f2 *= (1 + formant_shift * 0.2) 
            f3 *= (1 + formant_shift * 0.1)
            
            # Simple formant filtering using resonance approximation
            if voiced_pattern[i] > 0.1:
                # Emphasize frequencies near formants
                sample = voiced_audio[i]
                
                # F1 resonance
                f1_resonance = 1 + 0.4 * np.exp(-((frequency[i] - f1) / 80) ** 2)
                # F2 resonance  
                f2_resonance = 1 + 0.3 * np.exp(-((frequency[i] - f2) / 120) ** 2)
                # F3 resonance
                f3_resonance = 1 + 0.2 * np.exp(-((frequency[i] - f3) / 200) ** 2)
                
                filtered_audio[i] = sample * f1_resonance * f2_resonance * f3_resonance
            else:
                # Unvoiced segments - add fricative noise
                noise_amp = (1 - voiced_pattern[i]) * 0.15
                filtered_audio[i] = noise[i] * noise_amp
        
        return filtered_audio
    
    def _create_phoneme_pattern(self, t: np.ndarray) -> np.ndarray:
        """Create a pattern of voiced/unvoiced segments to simulate phonemes"""
        
        pattern = np.ones_like(t)
        
        # Create syllable-like rhythm
        syllable_rate = 4.5  # Syllables per second
        syllable_phase = 2 * np.pi * syllable_rate * t
        
        # Most of syllable is voiced (vowel), with brief unvoiced parts (consonants)
        voiced_base = 0.8 + 0.2 * np.sin(syllable_phase)
        
        # Add consonant-like unvoiced segments
        consonant_rate = 8.0  # Consonant events per second
        consonant_phase = 2 * np.pi * consonant_rate * t
        consonant_trigger = np.sin(consonant_phase + np.pi/4)
        
        # Sharp consonant transitions
        consonant_mask = (consonant_trigger > 0.85).astype(float)
        
        # Combine patterns - consonants reduce voicing
        pattern = voiced_base * (1 - consonant_mask * 0.7)
        
        # Smooth transitions to avoid clicks
        kernel_size = max(3, len(pattern) // 200)
        if kernel_size % 2 == 0:
            kernel_size += 1
        
        if kernel_size >= 3 and kernel_size <= len(pattern) // 3:
            kernel = np.ones(kernel_size) / kernel_size
            pattern = np.convolve(pattern, kernel, mode='same')
        
        return np.clip(pattern, 0, 1)
        
    def _generate_envelope(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
        """Generate amplitude envelope for speech-like rhythm"""
        
        envelope = np.ones_like(t)
        
        # Overall fade in/out
        fade_samples = min(int(0.05 * len(t)), 500)  # Fade over up to 5% of the clip, capped at 500 samples
        if fade_samples > 0:
            # Smooth fade in
            envelope[:fade_samples] *= np.sin(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
            # Smooth fade out
            envelope[-fade_samples:] *= np.cos(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
        
        # Estimate syllables from text length
        syllable_count = max(len(text.replace(' ', '')) // 3, 1)  # Rough syllable estimate
        duration = len(t) / settings.SAMPLE_RATE
        syllable_rate = syllable_count / duration
        
        # Create syllable-like amplitude modulation
        syllable_pattern = 0.6 + 0.4 * (np.sin(2 * np.pi * syllable_rate * t) ** 2)
        envelope *= syllable_pattern
        
        # Add word boundaries (pauses between words)
        word_count = len(text.split())
        if word_count > 1:
            word_rate = word_count / duration
            # Create brief pauses between words
            word_boundaries = np.sin(2 * np.pi * word_rate * t + np.pi/4)
            word_gates = np.where(word_boundaries < -0.8, 0.3, 1.0)  # Brief pauses
            envelope *= word_gates
        
        # Add breath-like variations
        breath_rate = 0.5  # Breathing-like variations
        breath_mod = 1 + 0.1 * np.sin(2 * np.pi * breath_rate * t)
        envelope *= breath_mod
        
        # Character-specific envelope characteristics
        speed = voice_config.get("speed", 1.0)
        if speed > 1.1:  # Fast talkers (like Jinx)
            # More staccato, energetic envelope
            energy_bursts = 1 + 0.2 * (np.random.rand(len(t)) > 0.7).astype(float)
            envelope *= energy_bursts
        elif speed < 0.95:  # Slow, deliberate speakers (like Moses)
            # Smoother, more sustained envelope
            envelope = np.power(envelope, 0.7)  # Gentler amplitude changes
        
        # Ensure envelope doesn't go below minimum level
        envelope = np.maximum(envelope, 0.1)
        
        return envelope
        
    def _audio_to_base64(self, audio_data: np.ndarray) -> str:
        """Convert audio numpy array to base64 string"""
        # Convert to 16-bit PCM
        audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
        
        # Create WAV file in memory manually
        buffer = io.BytesIO()
        
        # WAV file parameters
        sample_rate = settings.SAMPLE_RATE
        num_channels = 1  # Mono
        bits_per_sample = 16
        byte_rate = sample_rate * num_channels * bits_per_sample // 8
        block_align = num_channels * bits_per_sample // 8
        data_size = len(audio_int16) * 2  # 2 bytes per sample
        file_size = 36 + data_size
        
        # Write WAV header (44 bytes)
        buffer.write(b'RIFF')                                    # Chunk ID (4 bytes)
        buffer.write(file_size.to_bytes(4, 'little'))           # File size - 8 (4 bytes)
        buffer.write(b'WAVE')                                    # Format (4 bytes)
        buffer.write(b'fmt ')                                    # Subchunk1 ID (4 bytes)
        buffer.write((16).to_bytes(4, 'little'))                # Subchunk1 size (4 bytes)
        buffer.write((1).to_bytes(2, 'little'))                 # Audio format (PCM) (2 bytes)
        buffer.write(num_channels.to_bytes(2, 'little'))        # Num channels (2 bytes)
        buffer.write(sample_rate.to_bytes(4, 'little'))         # Sample rate (4 bytes)
        buffer.write(byte_rate.to_bytes(4, 'little'))           # Byte rate (4 bytes)
        buffer.write(block_align.to_bytes(2, 'little'))         # Block align (2 bytes)
        buffer.write(bits_per_sample.to_bytes(2, 'little'))     # Bits per sample (2 bytes)
        buffer.write(b'data')                                    # Subchunk2 ID (4 bytes)
        buffer.write(data_size.to_bytes(4, 'little'))           # Subchunk2 size (4 bytes)
        
        # Write audio data
        buffer.write(audio_int16.tobytes())
        
        logger.debug(f"Generated WAV file: {file_size + 8} bytes total, {data_size} bytes audio data")
        
        # Get bytes and encode to base64
        buffer.seek(0)
        audio_bytes = buffer.read()
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
        
        return f"data:audio/wav;base64,{audio_base64}"
        
    def get_character_voice_info(self, character_id: str) -> dict:
        """Get voice configuration for character"""
        return self.character_voice_configs.get(character_id, {})
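

if __name__ == "__main__":
    # Minimal usage sketch, not part of the service itself. It assumes
    # backend.config.settings provides ENABLE_VOICE=True and an integer
    # SAMPLE_RATE (e.g. 22050), and that this runs from the project root
    # so the backend package resolves.
    async def _demo():
        synth = SimpleVoiceSynthesizer()
        if await synth.initialize():
            data_url = await synth.synthesize("Hello there! How are you?", "jinx")
            print(f"Data URL length: {len(data_url)}" if data_url else "Synthesis failed")

    asyncio.run(_demo())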