# speak / app.py
# toshuu's picture
# Update app.py
# 4f01cd3 verified
import os
import sys
import tempfile
import torch
import gradio as gr
from datetime import datetime
import numpy as np
# Optional WAV-writing backends. Each flag records whether the matching
# module could be imported; the writer is picked from these flags later.
try:
    import scipy.io.wavfile as wavfile
except ImportError:
    USE_SCIPY = False
else:
    USE_SCIPY = True

try:
    import soundfile as sf
except ImportError:
    USE_SOUNDFILE = False
else:
    USE_SOUNDFILE = True
# Configuration: model archive location and the defaults used by the UI.
MODEL_PATH = "v4_indic.pt"
DEFAULT_SPEAKER = "hindi_female"
DEFAULT_SAMPLE_RATE = 48000

_startup_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"===== Application Startup at {_startup_stamp} =====")

# Load the packaged model once at import time; `m` is shared by all requests.
# NOTE(review): load_pickle unpickles objects stored in the archive, so
# MODEL_PATH should only ever point at a trusted file.
print(f"Loading model from {MODEL_PATH}")
_package_importer = torch.package.PackageImporter(MODEL_PATH)
m = _package_importer.load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Log the apply_tts signature so the accepted keyword arguments show up
# in the startup output.
import inspect

sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")
# Available speakers, named "<language>_<gender>". Languages are listed
# alphabetically with the female voice first; manipuri has no male voice.
_VOICES_BY_LANGUAGE = (
    ("bengali", ("female", "male")),
    ("gujarati", ("female", "male")),
    ("hindi", ("female", "male")),
    ("kannada", ("female", "male")),
    ("malayalam", ("female", "male")),
    ("manipuri", ("female",)),
    ("rajasthani", ("female", "male")),
    ("tamil", ("female", "male")),
    ("telugu", ("female", "male")),
)
AVAILABLE_SPEAKERS = [
    f"{language}_{gender}"
    for language, genders in _VOICES_BY_LANGUAGE
    for gender in genders
]
def _default_model_sample_rate():
    """Return the default of apply_tts's sample_rate parameter, or None if it cannot be read."""
    import inspect  # local import keeps this helper self-contained

    try:
        parameter = inspect.signature(m.apply_tts).parameters.get("sample_rate")
    except (TypeError, ValueError):
        return None
    if parameter is None or parameter.default is inspect.Parameter.empty:
        return None
    return parameter.default


def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Wrapper to call the model's apply_tts with validation and fallbacks.

    The call is attempted with progressively simpler keyword sets
    (ssml_text, then text, then text without sample_rate) and the first
    successful result is used.

    Args:
        text: Text to synthesize; surrounding whitespace and zero-width
            joiner/non-joiner characters are removed first
        speaker: Speaker voice; unknown names fall back to DEFAULT_SPEAKER
        sample_rate: Sample rate passed to the model
    Returns:
        The audio returned by apply_tts (first element if it returns a tuple)
    Raises:
        ValueError: If the cleaned text is empty or every attempt fails
    """
    # NOTE(review): Silero's published v4_indic usage romanizes the input
    # (ISO transliteration) before apply_tts; this wrapper passes the text
    # through unchanged. Confirm native-script input is intended.

    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Drop zero-width characters BEFORE the emptiness check so input made up
    # only of them is rejected here instead of reaching the model as "".
    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    attempts = [
        # Try with ssml_text parameter (some models prefer this)
        ("ssml_text", "ssml_text parameter",
         {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate}),
        # Try with text parameter
        ("text", "text parameter",
         {"text": text, "speaker": speaker, "sample_rate": sample_rate}),
    ]

    # The minimal call omits sample_rate, so the model renders at its own
    # default rate. Only allow it when that default matches the requested
    # rate (or cannot be determined); otherwise the caller would label the
    # audio with the wrong rate and it would play at the wrong speed.
    model_rate = _default_model_sample_rate()
    if model_rate is None or model_rate == sample_rate:
        attempts.append(
            ("minimal", "minimal parameters", {"text": text, "speaker": speaker})
        )

    last_error = None
    for label, success_note, kwargs in attempts:
        try:
            res = m.apply_tts(**kwargs)
        except Exception as err:
            # Broad on purpose: the packaged model's exception types are
            # opaque and any failure should move on to the next attempt.
            last_error = err
            print(f"{label} attempt failed: {err}")
            continue
        print(f"Success with {success_note}")
        break
    else:
        print(f"All attempts failed. Last error: {last_error}")
        raise ValueError(
            f"Text processing failed. The model may not support this text. Error: {last_error}"
        ) from last_error

    # Handle different return types
    if isinstance(res, tuple):
        return res[0]
    return res
def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save to temporary WAV file.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate, also written as the WAV file's rate
    Returns:
        Path to generated WAV file (the caller owns the file)
    Raises:
        RuntimeError: If neither scipy nor soundfile is available
        ValueError: If the text is rejected or the model returns no samples
    """
    # Fail before running the model or touching disk when nothing can
    # write the file.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    audio = _call_apply_tts(text, speaker, sample_rate)

    # Convert to numpy array if needed; detach() keeps the conversion valid
    # even if the tensor is tracking gradients.
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    else:
        audio = np.asarray(audio)

    if audio.size == 0:
        raise ValueError("Model returned no audio samples")

    # Ensure audio is 16-bit PCM
    if audio.dtype != np.int16:
        # Scale down to the -1..1 range only when the signal exceeds it
        peak = np.abs(audio).max()
        if peak > 1.0:
            audio = audio / peak
        audio = (audio * 32767).astype(np.int16)

    # Create temporary file; only the path is needed, so close the handle
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    # Save audio using available library (scipy preferred)
    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, audio)
        else:
            sf.write(path, audio, sample_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup; the write error is what matters
        raise
    return path
def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function.

    Validates the input, runs synthesis, and converts any failure into a
    gr.Error so it is shown to the user.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate
    Returns:
        Path to generated audio file
    Raises:
        gr.Error: If the text is empty, too long, or synthesis fails
    """
    # Trim first so the checks below see what will actually be synthesized
    # (the synthesis path strips surrounding whitespace as well).
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        raise gr.Error("Please enter some text to synthesize")

    # Reject text that is too long
    if len(text) > 200:
        raise gr.Error("Text is too long. Please use shorter text (under 200 characters)")

    try:
        return synthesize_text_to_wavfile(text, speaker, sample_rate)
    except ValueError as e:
        raise gr.Error(
            f"Text processing failed: {str(e)}. Try simpler text or a different language."
        ) from e
    except Exception as e:
        # Top-level UI boundary: surface every other failure to the user.
        raise gr.Error(f"Speech generation failed: {str(e)}") from e
# Build the Gradio UI: inputs on the left, generated audio on the right,
# clickable examples underneath.
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        # Left column: text, voice and sample-rate controls plus the button
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results",
            )
            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice",
            )
            # NOTE(review): confirm the model accepts every rate offered
            # here; Silero's published examples list 8000/24000/48000.
            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)",
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: player for the synthesized file
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
            )

    # The examples and the button feed the same three inputs to the handler
    _tts_inputs = [text_input, speaker_dropdown, sample_rate_dropdown]

    # Examples: one short greeting per script, each as [text, speaker, rate]
    _example_rows = [
        ["नमस्ते", "hindi_female", 48000],
        ["आप कैसे हैं", "hindi_male", 48000],
        ["হ্যালো", "bengali_female", 48000],
        ["வணக்கம்", "tamil_female", 48000],
        ["హలో", "telugu_female", 48000],
        ["ಹಲೋ", "kannada_female", 48000],
        ["હેલો", "gujarati_female", 48000],
    ]
    gr.Examples(
        examples=_example_rows,
        inputs=_tts_inputs,
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False,
    )

    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=_tts_inputs,
        outputs=audio_output,
    )
# Launch the app with API enabled (only when run as a script)
if __name__ == "__main__":
    _launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "show_api": True,  # This enables the API documentation
    }
    demo.launch(**_launch_options)