Spaces:

AJ50
/

voice-cloning-backend

Sleeping

App Files Files Community

AJ50 committed 19 days ago

Commit

e1c7f06

1 Parent(s): 9333545

Switch to gTTS for Hindi - reliable Google API, no local models

Browse files

Files changed (2) hide show

backend/app/multilingual_tts.py +43 -37
backend/requirements.txt +1 -0

backend/app/multilingual_tts.py CHANGED Viewed

@@ -110,33 +110,26 @@ class MultilingualTTSService:
             print("[MultilingualTTSService] ✓ English vocoder loaded")
     def _load_hindi_models(self):
-        """Load Hindi Facebook MMS model - no TOS required, lightweight."""
         if self._xtts_model is None:
-            print("[MultilingualTTSService] Loading Hindi Facebook MMS model...")
             try:
-                from TTS.api import TTS
-                # Facebook MMS: No TOS required, lightweight (200MB vs XTTS 1.8GB)
-                # Downloads once and caches locally
-                self._xtts_model = TTS(
-                    model_name="tts_models/hin/facebook/mms-tts-hin",
-                    gpu=False,
-                    progress_bar=False
-                )
-                print("[MultilingualTTSService] ✓ Hindi Facebook MMS loaded successfully")
-                print("[MultilingualTTSService]   Model: Facebook Massively Multilingual Speech (MMS)")
                 print("[MultilingualTTSService]   Language: Hindi (hin)")
-                print("[MultilingualTTSService]   TOS: No (Open model)")
             except ImportError:
                 raise ImportError(
-                    "TTS library required for Hindi support. "
-                    "Install with: pip install TTS>=0.21.0"
                 )
             except Exception as e:
-                print(f"[MultilingualTTSService] Error loading Hindi MMS model: {e}")
-                print(f"[MultilingualTTSService] Make sure TTS library is properly installed")
-                raise RuntimeError(f"Failed to load Hindi MMS model: {e}")
     def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
@@ -195,32 +188,45 @@ class MultilingualTTSService:
         return np.clip(synthesized, -1.0, 1.0)
     def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
-        """Synthesize Hindi speech using Facebook MMS model."""
         self._load_hindi_models()
         print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
-        # Facebook MMS uses simple TTS interface (no language parameter needed)
-        # MMS model is language-specific, already tuned for Hindi
         try:
-            audio = self._xtts_model.tts(
-                text=text,
-                speaker_wav=None  # MMS doesn't use speaker adaptation
-            )
         except Exception as e:
             print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
             raise RuntimeError(f"Hindi synthesis failed: {e}")
-        # Convert to float32 if needed
-        audio = np.asarray(audio, dtype=np.float32)
-        # Normalize
-        max_val = np.max(np.abs(audio))
-        if max_val > 0:
-            target_level = 0.707
-            audio = audio * (target_level / max_val)
-        return np.clip(audio, -1.0, 1.0)
     def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:

             print("[MultilingualTTSService] ✓ English vocoder loaded")
     def _load_hindi_models(self):
         """Prepare Hindi synthesis, which is backed by Google Text-to-Speech (gTTS).

         gTTS has no local model to load, so this only verifies that the
         ``gtts`` package can be imported and then stores ``True`` in
         ``self._xtts_model`` as an "already initialised" flag. Once the flag
         is set, later calls return immediately.

         Raises:
             ImportError: if the ``gtts`` package is not installed.
             RuntimeError: if initialisation fails for any other reason.
         """
         if self._xtts_model is not None:
             return

         print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
         try:
             # Imported only to prove the dependency is present; the class
             # itself is instantiated at synthesis time.
             from gtts import gTTS  # noqa: F401
             print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
             print("[MultilingualTTSService]   Engine: Google Text-to-Speech (gTTS)")
             print("[MultilingualTTSService]   Language: Hindi (hin)")
             print("[MultilingualTTSService]   TOS: No (Google Cloud)")
             # Not a model object: just a flag so the check above short-circuits.
             self._xtts_model = True
         except ImportError as err:
             # Chain the original error so the real import failure stays visible.
             raise ImportError(
                 "gTTS library required for Hindi support. "
                 "Install with: pip install gtts"
             ) from err
         except Exception as e:
             print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
             raise RuntimeError(f"Failed to load Hindi support: {e}") from e
     def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
         return np.clip(synthesized, -1.0, 1.0)
     def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
         """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

         The MP3 produced by gTTS is decoded with pydub, down-mixed to mono and
         scaled from integer PCM to floating point.

         Args:
             text: Text to speak; it is sent to gTTS with ``lang='hi'``.
             voice_sample_path: Not used by this implementation (no speaker
                 adaptation is performed on this path).

         Returns:
             Mono float32 samples clipped to [-1.0, 1.0]. The samples keep the
             frame rate of the decoded MP3; nothing here resamples them.
             NOTE(review): confirm callers do not assume a different rate.

         Raises:
             ImportError: propagated from ``_load_hindi_models`` when gTTS is
                 not installed.
             RuntimeError: if speech generation, MP3 decoding or sample
                 conversion fails (the original error is chained as the cause).
         """
         self._load_hindi_models()
         print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
         try:
             from gtts import gTTS
             import io
             from pydub import AudioSegment

             # Keep the MP3 in memory rather than going through a temp file.
             mp3_buffer = io.BytesIO()
             gTTS(text=text, lang='hi', slow=False).write_to_fp(mp3_buffer)
             mp3_buffer.seek(0)
             audio_segment = AudioSegment.from_mp3(mp3_buffer)

             samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
             if samples.size == 0:
                 # Fail with a clear message instead of returning silence.
                 raise ValueError("gTTS returned no audio samples")

             # Multi-channel samples are interleaved frame by frame; average
             # every frame's channels to get mono (covers stereo and beyond).
             channels = audio_segment.channels
             if channels > 1:
                 samples = samples.reshape((-1, channels)).mean(axis=1)

             # Scale integer PCM to [-1, 1] using the full-scale value for the
             # actual sample width. 16-bit keeps the historical 32767 divisor so
             # existing output is unchanged; other widths use 2**(bits - 1)
             # (128 for 8-bit, as before, and a correct value for 32-bit).
             width = audio_segment.sample_width
             full_scale = 32767.0 if width == 2 else float(1 << (8 * width - 1))
             return np.clip(samples / full_scale, -1.0, 1.0)
         except Exception as e:
             print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
             raise RuntimeError(f"Hindi synthesis failed: {e}") from e
     def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:

backend/requirements.txt CHANGED Viewed

@@ -15,3 +15,4 @@ inflect==7.0.0
 unidecode>=1.3.2
 webrtcvad==2.0.10
 demucs==4.0.1

 unidecode>=1.3.2
 webrtcvad==2.0.10
 demucs==4.0.1
+gtts==2.4.0