# Source: Hugging Face Space "CarsaAI/carsa_api" (status: Running)
# File size: 12,691 bytes
# Revision: d01de5d


"""
Text-to-Speech Engine for Voice Assistant

A complete, self-contained Python class that provides high-quality text-to-speech
synthesis using the Coqui TTS library with multi-speaker support.

Author: Voice Assistant Team
Version: 1.0.0
"""

import os
import torch
from TTS.api import TTS
import logging

# Configure logging
# NOTE: basicConfig() runs at import time, so merely importing this module
# configures the root logger (INFO level, timestamped format). basicConfig()
# is a no-op when the root logger already has handlers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by TTSEngine and main() below.
logger = logging.getLogger(__name__)


class TTSEngine:
    """
    A Text-to-Speech engine built on the Coqui TTS library.

    Wraps a ``TTS.api.TTS`` model and exposes helpers to synthesize speech
    either to a WAV file or to in-memory WAV bytes, plus small introspection
    helpers (available speakers, model information).

    Attributes set by ``__init__``:
        tts: The loaded ``TTS`` model, moved to the selected device.
        model_name (str): Name of the model that was loaded.
        device (str): "cuda" when a GPU is available, otherwise "cpu".
    """

    def __init__(self, model_name="tts_models/en/vctk/vits"):
        """
        Initialize the TTS engine with a pre-trained model.

        Args:
            model_name (str): The name of the TTS model to load.
                             Default: "tts_models/en/vctk/vits" (multi-speaker English)

        Raises:
            Exception: If model loading fails. The original error is attached
                as ``__cause__``.
        """
        try:
            logger.info("Initializing TTS Engine...")

            # Check for GPU availability once and reuse the answer below.
            use_gpu = torch.cuda.is_available()
            device = "cuda" if use_gpu else "cpu"
            logger.info(f"Using device: {device}")
            if use_gpu:
                logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
                logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
            else:
                logger.warning("No GPU detected - using CPU")

            # Load the TTS model
            logger.info(f"Loading model: {model_name}")
            self.tts = TTS(model_name=model_name).to(device)

            # Store model information
            self.model_name = model_name
            self.device = device

            logger.info("✅ TTS Engine initialized successfully!")
            logger.info(f"   Model: {model_name}")
            logger.info(f"   Device: {device}")

            # Print available speakers if it's a multi-speaker model
            speakers = self.get_available_speakers()
            if speakers:
                logger.info(f"   Available speakers: {len(speakers)}")
                logger.info(f"   Sample speakers: {speakers[:5]}...")

        except Exception as e:
            logger.error(f"❌ Failed to initialize TTS Engine: {str(e)}")
            # Chain the cause so callers can still inspect the real failure.
            raise Exception(f"TTS Engine initialization failed: {str(e)}") from e

    @staticmethod
    def _require_text(text):
        """Raise ValueError when ``text`` is empty or whitespace-only."""
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

    @staticmethod
    def _preview(text):
        """Return at most the first 50 characters of ``text`` for log output."""
        return f"{text[:50]}{'...' if len(text) > 50 else ''}"

    @staticmethod
    def _synthesis_kwargs(text, speaker, language):
        """Build the keyword arguments shared by ``tts()`` and ``tts_to_file()``."""
        kwargs = {"text": text, "speaker": speaker}
        # The language argument is only forwarded when the caller supplied one.
        if language:
            kwargs["language"] = language
        return kwargs

    def synthesize(self, text, output_path, speaker="p225", language=None):
        """
        Synthesize speech from text and save to file.

        Args:
            text (str): The text to convert to speech
            output_path (str): File path to save the generated audio. ".wav"
                is appended when the path does not already end with it.
            speaker (str): Speaker ID for multi-speaker models (default: "p225")
            language (str): Language code (optional; not passed to the model
                when falsy)

        Returns:
            bool: True when the output file was written. Failures are reported
                by raising, never by returning False.

        Raises:
            Exception: If validation or synthesis fails. The original error is
                attached as ``__cause__``.
        """
        try:
            # Validate input
            self._require_text(text)

            if not output_path:
                raise ValueError("Output path cannot be empty")

            # Ensure output directory exists. exist_ok avoids a crash when
            # another process creates the directory between check and call.
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir, exist_ok=True)
                logger.info(f"Created output directory: {output_dir}")

            # Ensure output path has .wav extension
            if not output_path.lower().endswith('.wav'):
                output_path += '.wav'

            logger.info("Synthesizing speech...")
            logger.info(f"   Text: {self._preview(text)}")
            logger.info(f"   Speaker: {speaker}")
            logger.info(f"   Output: {output_path}")

            # Perform text-to-speech synthesis
            self.tts.tts_to_file(
                file_path=output_path,
                **self._synthesis_kwargs(text, speaker, language)
            )

            # Verify the file was created
            if not os.path.exists(output_path):
                raise Exception("Output file was not created")

            file_size = os.path.getsize(output_path)
            logger.info("✅ Speech synthesis completed successfully!")
            logger.info(f"   Output file: {output_path}")
            logger.info(f"   File size: {file_size} bytes")
            return True

        except Exception as e:
            logger.error(f"❌ Speech synthesis failed: {str(e)}")
            raise Exception(f"Speech synthesis failed: {str(e)}") from e

    def get_available_speakers(self):
        """
        Get list of available speakers for the loaded model.

        Returns:
            list: List of available speaker IDs, or empty list if the model
                exposes no speakers or the lookup fails (the failure is logged).
        """
        try:
            speakers = getattr(self.tts, 'speakers', None)
            return list(speakers) if speakers else []
        except Exception as e:
            logger.error(f"Failed to get available speakers: {str(e)}")
            return []

    def get_model_info(self):
        """
        Get information about the loaded model.

        Returns:
            dict: Keys "model_name", "device", "available_speakers" and
                "is_multi_speaker"; an empty dict if the lookup fails
                (the failure is logged).
        """
        try:
            return {
                "model_name": self.model_name,
                "device": self.device,
                "available_speakers": self.get_available_speakers(),
                "is_multi_speaker": hasattr(self.tts, 'speakers') and bool(self.tts.speakers)
            }
        except Exception as e:
            logger.error(f"Failed to get model info: {str(e)}")
            return {}

    @property
    def model(self):
        """
        Property to check if the TTS model is loaded.

        Returns:
            bool: True if ``self.tts`` exists and is not None, False otherwise
        """
        return getattr(self, 'tts', None) is not None

    def synthesize_to_bytes(self, text, speaker="p225", language=None):
        """
        Synthesize speech from text and return audio bytes directly.

        Args:
            text (str): The text to convert to speech
            speaker (str): Speaker ID for multi-speaker models (default: "p225")
            language (str): Language code (optional; not passed to the model
                when falsy)

        Returns:
            bytes: Audio data as WAV bytes

        Raises:
            Exception: If validation or synthesis fails. The original error is
                attached as ``__cause__``.
        """
        try:
            # Validate input
            self._require_text(text)

            # Imported before inference so a missing dependency fails fast
            # instead of after the expensive synthesis step.
            import io
            import soundfile as sf

            logger.info("Synthesizing speech to bytes...")
            logger.info(f"   Text: {self._preview(text)}")
            logger.info(f"   Speaker: {speaker}")

            # Perform text-to-speech synthesis and get audio data
            audio_data = self.tts.tts(**self._synthesis_kwargs(text, speaker, language))

            # Convert audio data to WAV bytes
            buffer = io.BytesIO()
            sf.write(buffer, audio_data, self.tts.synthesizer.output_sample_rate, format='WAV')
            # getvalue() copies the buffer, so call it only once.
            wav_bytes = buffer.getvalue()

            logger.info("✅ Speech synthesis to bytes completed successfully!")
            logger.info(f"   Audio size: {len(wav_bytes)} bytes")

            return wav_bytes

        except Exception as e:
            logger.error(f"❌ Speech synthesis to bytes failed: {str(e)}")
            raise Exception(f"Speech synthesis to bytes failed: {str(e)}") from e


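# =============================================================================
# CUSTOM VOICE CLONING INITIALIZATION (DISABLED)
# The triple-quoted string below is an inert string literal, so none of the
# code inside it runs. It is a template to copy into TTSEngine when a custom
# cloned-voice model is available.
# =============================================================================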
"""
# Alternative __init__ method for custom voice cloning
# Uncomment and modify this section when you have a custom cloned voice model

def __init__(self, model_path="path/to/your/custom/model", speaker_wav="speaker.wav"):
    '''
    Initialize the TTS engine with a custom cloned voice model.
    
    Args:
        model_path (str): Path to the custom TTS model directory
        speaker_wav (str): Path to the speaker reference audio file
    
    Raises:
        Exception: If model loading fails
    '''
    try:
        logger.info("Initializing TTS Engine with custom voice model...")
        
        # Check for GPU availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")
        
        # Load the custom TTS model
        logger.info(f"Loading custom model from: {model_path}")
        self.tts = TTS(model_path=model_path).to(device)
        
        # Store model information
        self.model_path = model_path
        self.speaker_wav = speaker_wav
        self.device = device
        
        logger.info(f"✅ Custom TTS Engine initialized successfully!")
        logger.info(f"   Model path: {model_path}")
        logger.info(f"   Speaker file: {speaker_wav}")
        logger.info(f"   Device: {device}")
        
    except Exception as e:
        logger.error(f"❌ Failed to initialize custom TTS Engine: {str(e)}")
        raise Exception(f"Custom TTS Engine initialization failed: {str(e)}")

# Custom synthesis method for voice cloning
def synthesize_with_cloned_voice(self, text, output_path):
    '''
    Synthesize speech using the cloned voice.
    
    Args:
        text (str): The text to convert to speech
        output_path (str): File path to save the generated audio
    
    Returns:
        bool: True if synthesis was successful
    '''
    try:
        logger.info(f"Synthesizing speech with cloned voice...")
        
        # Perform text-to-speech synthesis with cloned voice
        self.tts.tts_to_file(
            text=text,
            speaker_wav=self.speaker_wav,
            file_path=output_path
        )
        
        logger.info(f"✅ Cloned voice synthesis completed!")
        return True
        
    except Exception as e:
        logger.error(f"❌ Cloned voice synthesis failed: {str(e)}")
        raise Exception(f"Cloned voice synthesis failed: {str(e)}")
"""


def main():
    """
    Demonstrate TTSEngine end to end.

    Builds an engine with the default model, logs its model information and
    synthesizes one sample sentence to "test_output.wav" with speaker "p225".
    Any failure is logged rather than propagated.
    """
    sample_text = "Hello! This is a test of the text-to-speech engine. The voice synthesis is working perfectly."
    wav_path = "test_output.wav"

    try:
        # Build the engine with its default model
        logger.info("Creating TTS Engine instance...")
        engine = TTSEngine()

        # Report what was loaded
        logger.info(f"Model Information: {engine.get_model_info()}")

        # Render the sample sentence with a specific speaker from the VCTK dataset
        succeeded = engine.synthesize(
            text=sample_text,
            output_path=wav_path,
            speaker="p225"
        )

        if not succeeded:
            logger.error("❌ Test failed!")
            return

        logger.info("🎉 Test completed successfully!")
        logger.info(f"Check the generated audio file: {wav_path}")

    except Exception as e:
        logger.error(f"❌ Example usage failed: {str(e)}")


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()