AJ50 committed on
Commit
e1c7f06
·
1 Parent(s): 9333545

Switch to gTTS for Hindi - reliable Google API, no local models

Browse files
backend/app/multilingual_tts.py CHANGED
@@ -110,33 +110,26 @@ class MultilingualTTSService:
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
- """Load Hindi Facebook MMS model - no TOS required, lightweight."""
114
  if self._xtts_model is None:
115
- print("[MultilingualTTSService] Loading Hindi Facebook MMS model...")
116
  try:
117
- from TTS.api import TTS
118
-
119
- # Facebook MMS: No TOS required, lightweight (200MB vs XTTS 1.8GB)
120
- # Downloads once and caches locally
121
- self._xtts_model = TTS(
122
- model_name="tts_models/hin/facebook/mms-tts-hin",
123
- gpu=False,
124
- progress_bar=False
125
- )
126
- print("[MultilingualTTSService] ✓ Hindi Facebook MMS loaded successfully")
127
- print("[MultilingualTTSService] Model: Facebook Massively Multilingual Speech (MMS)")
128
  print("[MultilingualTTSService] Language: Hindi (hin)")
129
- print("[MultilingualTTSService] TOS: No (Open model)")
 
 
130
 
131
  except ImportError:
132
  raise ImportError(
133
- "TTS library required for Hindi support. "
134
- "Install with: pip install TTS>=0.21.0"
135
  )
136
  except Exception as e:
137
- print(f"[MultilingualTTSService] Error loading Hindi MMS model: {e}")
138
- print(f"[MultilingualTTSService] Make sure TTS library is properly installed")
139
- raise RuntimeError(f"Failed to load Hindi MMS model: {e}")
140
 
141
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
142
  language: str = "english") -> np.ndarray:
@@ -195,32 +188,45 @@ class MultilingualTTSService:
195
  return np.clip(synthesized, -1.0, 1.0)
196
 
197
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
198
- """Synthesize Hindi speech using Facebook MMS model."""
199
  self._load_hindi_models()
200
 
201
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
202
 
203
- # Facebook MMS uses simple TTS interface (no language parameter needed)
204
- # MMS model is language-specific, already tuned for Hindi
205
  try:
206
- audio = self._xtts_model.tts(
207
- text=text,
208
- speaker_wav=None # MMS doesn't use speaker adaptation
209
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  except Exception as e:
211
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
212
  raise RuntimeError(f"Hindi synthesis failed: {e}")
213
-
214
- # Convert to float32 if needed
215
- audio = np.asarray(audio, dtype=np.float32)
216
-
217
- # Normalize
218
- max_val = np.max(np.abs(audio))
219
- if max_val > 0:
220
- target_level = 0.707
221
- audio = audio * (target_level / max_val)
222
-
223
- return np.clip(audio, -1.0, 1.0)
224
 
225
  def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
226
  output_path: Union[str, Path], language: str = "english") -> Path:
 
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
+ """Load Hindi models - using Google Text-to-Speech (gTTS)."""
114
  if self._xtts_model is None:
115
+ print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
116
  try:
117
+ from gtts import gTTS
118
+ print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
119
+ print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
 
 
 
 
 
 
 
 
120
  print("[MultilingualTTSService] Language: Hindi (hin)")
121
+ print("[MultilingualTTSService] TOS: No (Google Cloud)")
122
+ # Mark as loaded (gTTS doesn't require actual model loading)
123
+ self._xtts_model = True
124
 
125
  except ImportError:
126
  raise ImportError(
127
+ "gTTS library required for Hindi support. "
128
+ "Install with: pip install gtts"
129
  )
130
  except Exception as e:
131
+ print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
132
+ raise RuntimeError(f"Failed to load Hindi support: {e}")
 
133
 
134
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
135
  language: str = "english") -> np.ndarray:
 
188
  return np.clip(synthesized, -1.0, 1.0)
189
 
190
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
191
+ """Synthesize Hindi speech using Google Text-to-Speech (gTTS)."""
192
  self._load_hindi_models()
193
 
194
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
195
 
 
 
196
  try:
197
+ from gtts import gTTS
198
+ import io
199
+ from pydub import AudioSegment
200
+
201
+ # Generate speech using Google TTS
202
+ tts = gTTS(text=text, lang='hi', slow=False)
203
+
204
+ # Save to BytesIO buffer
205
+ buffer = io.BytesIO()
206
+ tts.write_to_fp(buffer)
207
+ buffer.seek(0)
208
+
209
+ # Load audio from buffer
210
+ audio_segment = AudioSegment.from_mp3(buffer)
211
+
212
+ # Convert to numpy array (mono, float32)
213
+ samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
214
+
215
+ # Handle stereo to mono conversion
216
+ if audio_segment.channels == 2:
217
+ # Convert stereo to mono by averaging channels
218
+ samples = samples.reshape((-1, 2)).mean(axis=1)
219
+
220
+ # Normalize to [-1, 1] range
221
+ max_val = np.max(np.abs(samples))
222
+ if max_val > 0:
223
+ samples = samples / (32767.0 if audio_segment.sample_width == 2 else 128.0)
224
+
225
+ return np.clip(samples, -1.0, 1.0)
226
+
227
  except Exception as e:
228
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
229
  raise RuntimeError(f"Hindi synthesis failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
232
  output_path: Union[str, Path], language: str = "english") -> Path:
backend/requirements.txt CHANGED
@@ -15,3 +15,4 @@ inflect==7.0.0
15
  unidecode>=1.3.2
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
 
 
15
  unidecode>=1.3.2
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
18
+ gtts==2.4.0