File size: 10,973 Bytes
03fe1d8
 
aeb26b6
 
 
3072798
aeb26b6
7fcb2a7
aeb26b6
7fcb2a7
 
 
3072798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aeb26b6
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a9194
03fe1d8
e6a9194
03fe1d8
e6a9194
 
 
 
 
 
 
3072798
e6a9194
03fe1d8
e6a9194
 
03fe1d8
d9c4b3e
e6a9194
 
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a9194
03fe1d8
 
 
 
f152556
e6a9194
 
 
 
 
 
 
 
 
 
 
 
 
 
e1c7f06
e6a9194
 
e1c7f06
e6a9194
 
 
 
e1c7f06
e6a9194
 
 
 
e1c7f06
e6a9194
e1c7f06
ccd13e3
 
 
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""

import os
import sys

# Set environment variables BEFORE any TTS imports to bypass CPML prompt
os.environ['TTS_HOME'] = '/tmp/tts_models'
os.environ['TTS_CPML'] = '1'
os.environ['TTS_SKIP_TOS'] = '1'
os.environ['TTS_DISABLE_WEB_VERSION_PROMPT'] = '1'
os.environ['COQUI_TOS_AGREED'] = '1'

# Create a silent TTS manager that handles model initialization without prompts
def _create_silent_tts_manager():
    """Create a TTS manager configured to skip all interactive prompts."""
    try:
        from TTS.utils.manage import ModelManager
        from pathlib import Path
        
        # Set model manager to use our TTS_HOME directory
        model_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
        model_dir.mkdir(parents=True, exist_ok=True)
        
        manager = ModelManager(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
        # Mark TOS as agreed in the manager to prevent prompts
        manager.tos_agreed = True
        
        return manager, model_dir
    except Exception as e:
        print(f"[WARNING] Could not create silent TTS manager: {e}")
        return None, None

import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Union
from enum import Enum


class Language(str, Enum):
    """Languages this service can synthesize.

    Subclassing ``str`` lets members compare equal to their plain-string
    values (e.g. ``Language.ENGLISH == "english"``).
    """

    ENGLISH = "english"
    HINDI = "hindi"


class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.

    - English: voice cloning — speaker-encoder embedding + Tacotron2
      synthesizer + WaveRNN vocoder, with a Griffin-Lim fallback when the
      vocoder raises.
    - Hindi: Google Text-to-Speech (gTTS), decoded from MP3 via pydub.
      gTTS cannot clone voices, so the reference voice sample is ignored on
      the Hindi path; the ``_xtts_model`` attribute name is a historical
      leftover and only acts as a loaded/not-loaded flag.

    All models are lazy-loaded on first synthesis and released by ``cleanup()``.
    """
    
    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service.
        
        Args:
            models_dir: Directory with English models (expects a ``default/``
                subdirectory containing encoder.pt, synthesizer.pt, vocoder.pt).
            hindi_model_dir: Optional Hindi model directory. Only used for
                logging here — the gTTS-based Hindi path never reads it.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        
        # Lazy-load markers. The encoder and vocoder modules hold their state
        # in module-level globals, so those two store True rather than an
        # actual model object; only _synthesizer_model holds a real instance.
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        self._xtts_model = None
        
        # NOTE(review): self.sr is never read by any method below —
        # synthesize_and_save hardcodes 16000/24000 instead. Confirm intent.
        self.sr = 16000
        
        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            # NOTE(review): this message is stale — _synthesize_hindi works via
            # gTTS regardless of hindi_model_dir, so Hindi is not actually
            # disabled when no path is given.
            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")
    
    def _load_english_models(self) -> None:
        """Lazy-load the English encoder, synthesizer and vocoder (idempotent).

        Raises:
            RuntimeError: if any of the three model files is missing under
                ``models_dir/default/``.
        """
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer
            enc_path = self.models_dir / "default" / "encoder.pt"
            if not enc_path.exists():
                raise RuntimeError(f"English encoder model missing: {enc_path}")
            encoder_infer.load_model(enc_path)
            # The encoder module keeps the model in its own globals;
            # True just records that load_model has been called.
            self._encoder_model = True
            print("[MultilingualTTSService] βœ“ English encoder loaded")
        
        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer
            syn_path = self.models_dir / "default" / "synthesizer.pt"
            if not syn_path.exists():
                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
            print("[MultilingualTTSService] βœ“ English synthesizer loaded")
        
        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer
            voc_path = self.models_dir / "default" / "vocoder.pt"
            if not voc_path.exists():
                raise RuntimeError(f"English vocoder model missing: {voc_path}")
            vocoder_infer.load_model(voc_path)
            # Same pattern as the encoder: vocoder state lives in its module.
            self._vocoder_model = True
            print("[MultilingualTTSService] βœ“ English vocoder loaded")
    
    def _load_hindi_models(self) -> None:
        """Verify gTTS is importable and mark Hindi support as available.

        No actual model is loaded — gTTS is a thin client for Google's
        hosted TTS, so this only checks the library is installed.

        Raises:
            ImportError: if the ``gtts`` package is not installed.
            RuntimeError: for any other failure while probing gTTS.
        """
        if self._xtts_model is None:
            print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
            try:
                from gtts import gTTS
                print("[MultilingualTTSService] βœ“ Hindi gTTS support loaded")
                print("[MultilingualTTSService]   Engine: Google Text-to-Speech (gTTS)")
                print("[MultilingualTTSService]   Language: Hindi (hin)")
                print("[MultilingualTTSService]   TOS: No (Google Cloud)")
                # Mark as loaded (gTTS doesn't require actual model loading)
                self._xtts_model = True
                    
            except ImportError:
                raise ImportError(
                    "gTTS library required for Hindi support. "
                    "Install with: pip install gtts"
                )
            except Exception as e:
                print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
                raise RuntimeError(f"Failed to load Hindi support: {e}")
    
    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                  language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.
        
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample. Only used by
                the English path; the Hindi (gTTS) path ignores it.
            language: "english" or "hindi" (case-insensitive)
            
        Returns:
            Mono float32 audio waveform as numpy array, values in [-1, 1].

        Raises:
            ValueError: for an unsupported language string.
        """
        language = language.lower()
        
        # Language is a str-Enum, so a plain lowercase string compares equal
        # to the enum member directly.
        if language == Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        elif language == Language.HINDI:
            return self._synthesize_hindi(text, voice_sample_path)
        else:
            raise ValueError(f"Unsupported language: {language}")
    
    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2.

        Pipeline: embed the reference voice, synthesize a mel spectrogram
        conditioned on that embedding, then vocode (falling back to
        Griffin-Lim if the neural vocoder raises).
        """
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer
        
        self._load_english_models()
        
        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")
        
        # Embed voice
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)
        
        # Generate mel
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]
        
        # Vocalize
        try:
            # NOTE(review): target/overlap are WaveRNN batching parameters;
            # batched=False suggests they may be ignored here — confirm
            # against the vocoder module's infer_waveform signature.
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            # Best-effort fallback: Griffin-Lim is lower quality but never
            # depends on the neural vocoder's state.
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)
        
        # Normalize peak to 0.707 (~ -3 dBFS) to leave headroom, guarding
        # against division by zero for all-silent output.
        max_val = np.max(np.abs(synthesized))
        if max_val > 0:
            target_level = 0.707
            synthesized = synthesized * (target_level / max_val)
        
        return np.clip(synthesized, -1.0, 1.0)
    
    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

        Note: ``voice_sample_path`` is accepted for interface symmetry with
        the English path but is never used — gTTS cannot clone voices.
        Requires network access (gTTS calls Google's service) and pydub/ffmpeg
        for MP3 decoding.

        Raises:
            RuntimeError: wrapping any failure during synthesis or decoding.
        """
        self._load_hindi_models()
        
        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
        
        try:
            from gtts import gTTS
            import io
            from pydub import AudioSegment
            
            # Generate speech using Google TTS
            tts = gTTS(text=text, lang='hi', slow=False)
            
            # Save to BytesIO buffer
            buffer = io.BytesIO()
            tts.write_to_fp(buffer)
            buffer.seek(0)
            
            # Load audio from buffer
            audio_segment = AudioSegment.from_mp3(buffer)
            
            # Convert to numpy array (mono, float32)
            samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
            
            # Handle stereo to mono conversion
            if audio_segment.channels == 2:
                # Convert stereo to mono by averaging channels
                samples = samples.reshape((-1, 2)).mean(axis=1)
            
            # Scale integer PCM to [-1, 1] by the full-scale constant for the
            # sample width (16-bit -> 32767, assumed 8-bit otherwise). The
            # max_val guard only skips the no-op scaling of all-zero audio;
            # the divisor itself does not depend on max_val.
            max_val = np.max(np.abs(samples))
            if max_val > 0:
                samples = samples / (32767.0 if audio_segment.sample_width == 2 else 128.0)
            
            # NOTE(review): the decoded frame rate (audio_segment.frame_rate)
            # is discarded; synthesize_and_save assumes 24 kHz for Hindi —
            # confirm those match or resample here.
            return np.clip(samples, -1.0, 1.0)
            
        except Exception as e:
            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
            raise RuntimeError(f"Hindi synthesis failed: {e}")
    
    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                           output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.
        
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice (ignored for Hindi)
            output_path: Where to save audio
            language: "english" or "hindi"
            
        Returns:
            Path to output file
        """
        import soundfile as sf
        
        output_path = Path(output_path)
        
        try:
            audio = self.synthesize(text, voice_sample_path, language)
            
            # Determine sample rate based on language
            # NOTE(review): 24000 assumes gTTS MP3s decode at 24 kHz — this is
            # not derived from the decoded audio; verify against
            # AudioSegment.frame_rate in _synthesize_hindi.
            sr = 24000 if language.lower() == Language.HINDI else 16000
            
            sf.write(output_path, audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path
            
        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise
    
    def cleanup(self) -> None:
        """Release model references and free CUDA memory (best effort).

        Dropping the markers also makes the next synthesis call re-run the
        lazy loaders.
        """
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup is best-effort; never propagate from here.
            print(f"[MultilingualTTSService] Cleanup warning: {e}")