File size: 10,973 Bytes
03fe1d8
 
aeb26b6
 
 
3072798
aeb26b6
7fcb2a7
aeb26b6
7fcb2a7
 
 
3072798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aeb26b6
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a9194
03fe1d8
e6a9194
03fe1d8
e6a9194
 
 
 
 
 
 
3072798
e6a9194
03fe1d8
e6a9194
 
03fe1d8
d9c4b3e
e6a9194
 
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a9194
03fe1d8
 
 
 
f152556
e6a9194
 
 
 
 
 
 
 
 
 
 
 
 
 
e1c7f06
e6a9194
 
e1c7f06
e6a9194
 
 
 
e1c7f06
e6a9194
 
 
 
e1c7f06
e6a9194
e1c7f06
ccd13e3
 
 
03fe1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""

import os
import sys

# Set environment variables BEFORE any TTS imports to bypass CPML prompt
os.environ['TTS_HOME'] = '/tmp/tts_models'
os.environ['TTS_CPML'] = '1'
os.environ['TTS_SKIP_TOS'] = '1'
os.environ['TTS_DISABLE_WEB_VERSION_PROMPT'] = '1'
os.environ['COQUI_TOS_AGREED'] = '1'

# Create a silent TTS manager that handles model initialization without prompts
def _create_silent_tts_manager():
    """Create a TTS manager configured to skip all interactive prompts."""
    try:
        from TTS.utils.manage import ModelManager
        from pathlib import Path
        
        # Set model manager to use our TTS_HOME directory
        model_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
        model_dir.mkdir(parents=True, exist_ok=True)
        
        manager = ModelManager(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
        # Mark TOS as agreed in the manager to prevent prompts
        manager.tos_agreed = True
        
        return manager, model_dir
    except Exception as e:
        print(f"[WARNING] Could not create silent TTS manager: {e}")
        return None, None

import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Union
from enum import Enum


class Language(str, Enum):
    """Languages this service can synthesize.

    Subclassing ``str`` lets members compare equal to their plain-string
    values (e.g. ``Language.ENGLISH == "english"``).
    """

    ENGLISH = "english"
    HINDI = "hindi"


class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.

    - English: voice cloning — speaker-encoder embedding + Tacotron2
      synthesizer + WaveRNN vocoder, with a Griffin-Lim fallback when the
      vocoder raises.
    - Hindi: Google Text-to-Speech (gTTS), decoded from MP3 via pydub.
      gTTS cannot clone voices, so the reference voice sample is ignored on
      the Hindi path; the ``_xtts_model`` attribute name is a historical
      leftover and only acts as a loaded/not-loaded flag.

    All models are lazy-loaded on first synthesis and released by ``cleanup()``.
    """
    
    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service.
        
        Args:
            models_dir: Directory with English models (expects a ``default/``
                subdirectory containing encoder.pt, synthesizer.pt, vocoder.pt).
            hindi_model_dir: Optional Hindi model directory. Only used for
                logging here — the gTTS-based Hindi path never reads it.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        
        # Lazy-load markers. The encoder and vocoder modules hold their state
        # in module-level globals, so those two store True rather than an
        # actual model object; only _synthesizer_model holds a real instance.
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        self._xtts_model = None
        
        # NOTE(review): self.sr is never read by any method below —
        # synthesize_and_save hardcodes 16000/24000 instead. Confirm intent.
        self.sr = 16000
        
        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            # NOTE(review): this message is stale — _synthesize_hindi works via
            # gTTS regardless of hindi_model_dir, so Hindi is not actually
            # disabled when no path is given.
            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")
    
    def _load_english_models(self) -> None:
        """Lazy-load the English encoder, synthesizer and vocoder (idempotent).

        Raises:
            RuntimeError: if any of the three model files is missing under
                ``models_dir/default/``.
        """
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer
            enc_path = self.models_dir / "default" / "encoder.pt"
            if not enc_path.exists():
                raise RuntimeError(f"English encoder model missing: {enc_path}")
            encoder_infer.load_model(enc_path)
            # The encoder module keeps the model in its own globals;
            # True just records that load_model has been called.
            self._encoder_model = True
            print("[MultilingualTTSService] βœ“ English encoder loaded")
        
        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer
            syn_path = self.models_dir / "default" / "synthesizer.pt"
            if not syn_path.exists():
                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
            print("[MultilingualTTSService] βœ“ English synthesizer loaded")
        
        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer
            voc_path = self.models_dir / "default" / "vocoder.pt"
            if not voc_path.exists():
                raise RuntimeError(f"English vocoder model missing: {voc_path}")
            vocoder_infer.load_model(voc_path)
            # Same pattern as the encoder: vocoder state lives in its module.
            self._vocoder_model = True
            print("[MultilingualTTSService] βœ“ English vocoder loaded")
    
    def _load_hindi_models(self) -> None:
        """Verify gTTS is importable and mark Hindi support as available.

        No actual model is loaded — gTTS is a thin client for Google's
        hosted TTS, so this only checks the library is installed.

        Raises:
            ImportError: if the ``gtts`` package is not installed.
            RuntimeError: for any other failure while probing gTTS.
        """
        if self._xtts_model is None:
            print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
            try:
                from gtts import gTTS
                print("[MultilingualTTSService] βœ“ Hindi gTTS support loaded")
                print("[MultilingualTTSService]   Engine: Google Text-to-Speech (gTTS)")
                print("[MultilingualTTSService]   Language: Hindi (hin)")
                print("[MultilingualTTSService]   TOS: No (Google Cloud)")
                # Mark as loaded (gTTS doesn't require actual model loading)
                self._xtts_model = True
                    
            except ImportError:
                raise ImportError(
                    "gTTS library required for Hindi support. "
                    "Install with: pip install gtts"
                )
            except Exception as e:
                print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
                raise RuntimeError(f"Failed to load Hindi support: {e}")
    
    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                  language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.
        
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample. Only used by
                the English path; the Hindi (gTTS) path ignores it.
            language: "english" or "hindi" (case-insensitive)
            
        Returns:
            Mono float32 audio waveform as numpy array, values in [-1, 1].

        Raises:
            ValueError: for an unsupported language string.
        """
        language = language.lower()
        
        # Language is a str-Enum, so a plain lowercase string compares equal
        # to the enum member directly.
        if language == Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        elif language == Language.HINDI:
            return self._synthesize_hindi(text, voice_sample_path)
        else:
            raise ValueError(f"Unsupported language: {language}")
    
    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2.

        Pipeline: embed the reference voice, synthesize a mel spectrogram
        conditioned on that embedding, then vocode (falling back to
        Griffin-Lim if the neural vocoder raises).
        """
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer
        
        self._load_english_models()
        
        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")
        
        # Embed voice
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)
        
        # Generate mel
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]
        
        # Vocalize
        try:
            # NOTE(review): target/overlap are WaveRNN batching parameters;
            # batched=False suggests they may be ignored here — confirm
            # against the vocoder module's infer_waveform signature.
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            # Best-effort fallback: Griffin-Lim is lower quality but never
            # depends on the neural vocoder's state.
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)
        
        # Normalize peak to 0.707 (~ -3 dBFS) to leave headroom, guarding
        # against division by zero for all-silent output.
        max_val = np.max(np.abs(synthesized))
        if max_val > 0:
            target_level = 0.707
            synthesized = synthesized * (target_level / max_val)
        
        return np.clip(synthesized, -1.0, 1.0)
    
    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

        Note: ``voice_sample_path`` is accepted for interface symmetry with
        the English path but is never used — gTTS cannot clone voices.
        Requires network access (gTTS calls Google's service) and pydub/ffmpeg
        for MP3 decoding.

        Raises:
            RuntimeError: wrapping any failure during synthesis or decoding.
        """
        self._load_hindi_models()
        
        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
        
        try:
            from gtts import gTTS
            import io
            from pydub import AudioSegment
            
            # Generate speech using Google TTS
            tts = gTTS(text=text, lang='hi', slow=False)
            
            # Save to BytesIO buffer
            buffer = io.BytesIO()
            tts.write_to_fp(buffer)
            buffer.seek(0)
            
            # Load audio from buffer
            audio_segment = AudioSegment.from_mp3(buffer)
            
            # Convert to numpy array (mono, float32)
            samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
            
            # Handle stereo to mono conversion
            if audio_segment.channels == 2:
                # Convert stereo to mono by averaging channels
                samples = samples.reshape((-1, 2)).mean(axis=1)
            
            # Scale integer PCM to [-1, 1] by the full-scale constant for the
            # sample width (16-bit -> 32767, assumed 8-bit otherwise). The
            # max_val guard only skips the no-op scaling of all-zero audio;
            # the divisor itself does not depend on max_val.
            max_val = np.max(np.abs(samples))
            if max_val > 0:
                samples = samples / (32767.0 if audio_segment.sample_width == 2 else 128.0)
            
            # NOTE(review): the decoded frame rate (audio_segment.frame_rate)
            # is discarded; synthesize_and_save assumes 24 kHz for Hindi —
            # confirm those match or resample here.
            return np.clip(samples, -1.0, 1.0)
            
        except Exception as e:
            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
            raise RuntimeError(f"Hindi synthesis failed: {e}")
    
    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                           output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.
        
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice (ignored for Hindi)
            output_path: Where to save audio
            language: "english" or "hindi"
            
        Returns:
            Path to output file
        """
        import soundfile as sf
        
        output_path = Path(output_path)
        
        try:
            audio = self.synthesize(text, voice_sample_path, language)
            
            # Determine sample rate based on language
            # NOTE(review): 24000 assumes gTTS MP3s decode at 24 kHz — this is
            # not derived from the decoded audio; verify against
            # AudioSegment.frame_rate in _synthesize_hindi.
            sr = 24000 if language.lower() == Language.HINDI else 16000
            
            sf.write(output_path, audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path
            
        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise
    
    def cleanup(self) -> None:
        """Release model references and free CUDA memory (best effort).

        Dropping the markers also makes the next synthesis call re-run the
        lazy loaders.
        """
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup is best-effort; never propagate from here.
            print(f"[MultilingualTTSService] Cleanup warning: {e}")