Spaces:

toshuu
/

speak

Runtime error

File size: 7,605 Bytes

bad894e
de64ba8
bad894e
 
de64ba8
 
5df7f8b
 
 
 
 
 
 
 
 
 
 
 
 
bad894e
de64ba8
bad894e
4f01cd3
de64ba8
bad894e
de64ba8
bad894e
de64ba8
 
 
 
3a268b8
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a268b8
de64ba8
3a268b8
de64ba8
 
 
 
 
30cf8cd
 
 
 
de64ba8
30cf8cd
 
de64ba8
30cf8cd
 
 
 
 
 
 
 
 
 
 
 
3a268b8
30cf8cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a268b8
bad894e
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
5df7f8b
 
 
 
 
 
 
 
 
 
 
 
de64ba8
 
 
 
5df7f8b
 
 
 
 
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cf8cd
de64ba8
30cf8cd
 
 
 
 
 
 
 
 
 
 
de64ba8
 
 
 
 
 
30cf8cd
de64ba8
 
 
 
 
30cf8cd
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cf8cd
 
 
 
 
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
4f01cd3
bad894e
de64ba8
 
 
4f01cd3
de64ba8

import inspect
import os
import sys
import tempfile
from datetime import datetime

import gradio as gr
import numpy as np
import torch

# Try to import audio libraries.
# scipy is the preferred WAV writer; soundfile is only tried when scipy is
# missing. Both flags are bound up front so that either one can be read
# safely regardless of which import succeeded.
USE_SCIPY = False
USE_SOUNDFILE = False
try:
    import scipy.io.wavfile as wavfile
except ImportError:
    try:
        import soundfile as sf
    except ImportError:
        # Neither backend is installed. Both flags stay False and the
        # synthesis code reports the missing dependency when it needs to write.
        pass
    else:
        USE_SOUNDFILE = True
else:
    USE_SCIPY = True

# Configuration
MODEL_PATH = "v4_indic.pt"  # torch.package archive; relative path, so resolved against the working directory
DEFAULT_SPEAKER = "hindi_female"
DEFAULT_SAMPLE_RATE = 48000

print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

# Load the model
# NOTE(security): load_pickle unpickles objects stored in the archive, which
# can execute arbitrary code. Only point MODEL_PATH at a file from a trusted
# source.
print(f"Loading model from {MODEL_PATH}")
m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Inspect apply_tts signature. This is diagnostic output only, so a callable
# that inspect cannot introspect must not prevent the app from starting.
try:
    sig = inspect.signature(m.apply_tts)
except (TypeError, ValueError) as err:
    sig = None
    print(f"apply_tts signature unavailable: {err}")
else:
    print(f"apply_tts signature: {sig}")

# Speaker voices offered in the UI dropdown and accepted by _call_apply_tts.
# Entries follow the "<language>_<gender>" pattern, grouped by language.
AVAILABLE_SPEAKERS = [
    # Bengali
    "bengali_female",
    "bengali_male",
    # Gujarati
    "gujarati_female",
    "gujarati_male",
    # Hindi
    "hindi_female",
    "hindi_male",
    # Kannada
    "kannada_female",
    "kannada_male",
    # Malayalam
    "malayalam_female",
    "malayalam_male",
    # Manipuri (female voice only)
    "manipuri_female",
    # Rajasthani
    "rajasthani_female",
    "rajasthani_male",
    # Tamil
    "tamil_female",
    "tamil_male",
    # Telugu
    "telugu_female",
    "telugu_male",
]

def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Run the loaded model's ``apply_tts`` and return the synthesized audio.

    The text is cleaned first: zero-width joiner / non-joiner characters are
    removed and surrounding whitespace is stripped. ``apply_tts`` is then
    tried with up to three keyword combinations, stopping at the first one
    that succeeds:

    1. ``ssml_text`` + ``speaker`` + ``sample_rate``
    2. ``text`` + ``speaker`` + ``sample_rate``
    3. ``text`` + ``speaker``

    Args:
        text: Text to synthesize. ``None`` is treated as empty.
        speaker: Speaker voice. Values not in AVAILABLE_SPEAKERS fall back to
            DEFAULT_SPEAKER with a printed warning.
        sample_rate: Sample rate passed to the model in the first two attempts.

    Returns:
        The model output; if the model returns a tuple, its first element.

    Raises:
        ValueError: If the cleaned text is empty, or if every call attempt
            fails (the last underlying exception is chained as the cause).
    """
    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Remove zero-width characters *before* stripping and checking emptiness:
    # str.strip() does not treat them as whitespace, so checking first would
    # let input made only of such characters reach the model as "".
    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # (success message, failure-message prefix, keyword arguments), in the
    # order they are tried. Some models expect ``ssml_text`` and others
    # ``text``, and some do not accept ``sample_rate`` at all.
    attempts = (
        (
            "Success with ssml_text parameter",
            "ssml_text attempt failed",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with text parameter",
            "text attempt failed",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with minimal parameters",
            "All attempts failed. Last error",
            {"text": text, "speaker": speaker},
        ),
    )

    res = None
    last_error = None
    for success_msg, failure_prefix, kwargs in attempts:
        try:
            res = m.apply_tts(**kwargs)
        except Exception as err:  # deliberate: any failure triggers the next fallback
            last_error = err  # keep a reference; ``err`` is unbound after this block
            print(f"{failure_prefix}: {err}")
        else:
            print(success_msg)
            break
    else:
        # Loop finished without ``break``: every attempt raised.
        raise ValueError(
            f"Text processing failed. The model may not support this text. Error: {last_error}"
        ) from last_error

    # Handle different return types: unwrap the first element of a tuple
    return res[0] if isinstance(res, tuple) else res


def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save it to a temporary WAV file.

    The audio returned by ``_call_apply_tts`` is converted to a NumPy array.
    Unless it is already ``int16``, it is scaled into the [-1, 1] range when
    its peak exceeds 1.0 and then converted to 16-bit PCM. The result is
    written with scipy when available, otherwise with soundfile.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate, used for synthesis and for the WAV file

    Returns:
        Path to the generated WAV file. The file is created with
        ``tempfile.mkstemp`` and is not deleted by this function on success.

    Raises:
        ValueError: If the text is rejected by ``_call_apply_tts`` or the
            model returns empty non-int16 audio.
        RuntimeError: If neither scipy nor soundfile is available.
    """
    audio = _call_apply_tts(text, speaker, sample_rate)

    # Convert to a numpy array. detach() keeps this working even if the
    # tensor is attached to an autograd graph.
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    else:
        audio = np.asarray(audio)

    # Ensure audio is 16-bit PCM
    if audio.dtype != np.int16:
        if audio.size == 0:
            # max() on an empty array would raise an opaque ValueError.
            raise ValueError("Model returned empty audio")
        # Normalize to the -1..1 range only when the signal exceeds it
        peak = np.abs(audio).max()
        if peak > 1.0:
            audio = audio / peak
        audio = (audio * 32767).astype(np.int16)

    # Check for a writer before creating the temp file so nothing is left
    # behind when no backend is installed. ``or`` short-circuits, so
    # USE_SOUNDFILE is only read when scipy is unavailable.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    # Create temporary file; only the path is needed, so close the descriptor
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, audio)
        else:
            sf.write(path, audio, sample_rate)
    except Exception:
        # Do not leak the temp file when writing fails; cleanup is
        # best-effort and the original error is what gets re-raised.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    return path


def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function.

    Validates the input, runs synthesis, and converts any failure into a
    ``gr.Error`` so the message is shown to the user.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate

    Returns:
        Path to generated audio file

    Raises:
        gr.Error: If the text is empty, longer than 200 characters after
            stripping surrounding whitespace, or if synthesis fails.
    """
    # Strip once up front so the emptiness and length checks look at the same
    # text that is actually synthesized.
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize")

    # Reject text that is too long
    if len(cleaned) > 200:
        raise gr.Error("Text is too long. Please use shorter text (under 200 characters)")

    try:
        return synthesize_text_to_wavfile(cleaned, speaker, sample_rate)
    except ValueError as e:
        raise gr.Error(f"Text processing failed: {str(e)}. Try simpler text or a different language.") from e
    except Exception as e:
        # UI boundary: surface any other failure to the user, keeping the cause chained.
        raise gr.Error(f"Speech generation failed: {str(e)}") from e


# Create Gradio interface
# Layout: three Markdown headers, then one row with two columns. The first
# column holds the inputs and the submit button, the second the audio output.
# An examples table and the button's click handler are attached below the row.
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")
    
    with gr.Row():
        with gr.Column():
            # Free-text input; tts_gradio_fn rejects empty text and text over 200 characters
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results"
            )
            
            # Voice selection, limited to the speakers listed in AVAILABLE_SPEAKERS
            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice"
            )
            
            # Output sample rate; the chosen value is passed to synthesis and to the WAV writer
            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)"
            )
            
            submit_btn = gr.Button("Generate Speech", variant="primary")
        
        with gr.Column():
            # type="filepath" matches tts_gradio_fn, which returns the path of a WAV file
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )
    
    # Examples
    # Each row is [text, speaker, sample_rate], in the same order as `inputs`.
    # cache_examples=False: outputs are not pre-computed here
    # NOTE(review): presumably examples then only run tts_gradio_fn on demand;
    # confirm against the Gradio version in use.
    gr.Examples(
        examples=[
            ["नमस्ते", "hindi_female", 48000],
            ["आप कैसे हैं", "hindi_male", 48000],
            ["হ্যালো", "bengali_female", 48000],
            ["வணக்கம்", "tamil_female", 48000],
            ["హలో", "telugu_female", 48000],
            ["ಹಲೋ", "kannada_female", 48000],
            ["હેલો", "gujarati_female", 48000],
        ],
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False
    )
    
    # Clicking the button runs tts_gradio_fn on the three inputs and sends the
    # returned file path to the audio player.
    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output
    )

# Launch the app with API enabled
# Guarded so the server starts only when this file is run as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch(
        # Wildcard address: listen on all network interfaces, not just localhost
        server_name="0.0.0.0",
        # Fixed port the server listens on
        server_port=7860,
        show_api=True  # This enables the API documentation
    )