|
|
import contextlib
import inspect
import os
import sys
import tempfile
from datetime import datetime

import gradio as gr
import numpy as np
import torch
|
|
|
|
|
|
|
|
# Pick a WAV-writing backend once, at import time. scipy is preferred;
# soundfile is only tried when scipy is missing. Both flags are always
# defined so later code can test either one without risking a NameError.
USE_SCIPY = False
USE_SOUNDFILE = False
try:
    import scipy.io.wavfile as wavfile
except ImportError:
    try:
        import soundfile as sf
    except ImportError:
        # Neither backend is installed. This is reported when synthesis
        # actually needs to write a file, not at import time.
        pass
    else:
        USE_SOUNDFILE = True
else:
    USE_SCIPY = True
|
|
|
|
|
|
|
|
# Model archive opened with torch.package at startup (relative path).
MODEL_PATH = "v4_indic.pt"
# Voice used when the caller passes no speaker or an unknown one.
DEFAULT_SPEAKER = "hindi_female"
# Output sample rate in Hz used when the caller does not choose one.
DEFAULT_SAMPLE_RATE = 48000
|
|
|
|
|
print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

# Load the packaged model once at import time; every request reuses `m`.
# NOTE(review): load_pickle unpickles objects stored in the archive, so
# MODEL_PATH must only ever point at a file from a trusted source.
print(f"Loading model from {MODEL_PATH}")
m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Log which keyword arguments this model's apply_tts accepts; useful when
# diagnosing why one of the call variants in _call_apply_tts is rejected.
sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")
|
|
|
|
|
|
|
|
# Speaker ids offered in the UI and accepted by _call_apply_tts, named
# "<language>_<gender>". Manipuri is listed with a female voice only.
AVAILABLE_SPEAKERS = [
    "bengali_female", "bengali_male",
    "gujarati_female", "gujarati_male",
    "hindi_female", "hindi_male",
    "kannada_female", "kannada_male",
    "malayalam_female", "malayalam_male",
    "manipuri_female",
    "rajasthani_female", "rajasthani_male",
    "tamil_female", "tamil_male",
    "telugu_female", "telugu_male",
]
|
|
|
|
|
def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """Call the loaded model's ``apply_tts`` and return the synthesized audio.

    Three call variants are tried in order until one succeeds: text passed
    as ``ssml_text`` with a sample rate, text passed as ``text`` with a
    sample rate, and finally ``text`` with the speaker only.

    Args:
        text: Text to synthesize. Zero-width joiner / non-joiner characters
            and surrounding whitespace are removed before use.
        speaker: Speaker id; anything not in ``AVAILABLE_SPEAKERS`` is
            replaced by ``DEFAULT_SPEAKER`` with a printed warning.
        sample_rate: Sample rate forwarded to the model where accepted.

    Returns:
        Whatever ``apply_tts`` returned, or its first element when the
        result is a tuple.

    Raises:
        ValueError: If the cleaned text is empty, or if every call variant
            fails (chained to the last underlying error).
    """
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Remove zero-width characters *before* the emptiness check: str.strip()
    # does not treat them as whitespace, so input made only of them would
    # otherwise reach the model as an empty string.
    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # (message on success, log prefix on failure, keyword arguments).
    # The last variant has no failure prefix because its failure is
    # reported by the "All attempts failed" message instead.
    attempts = (
        (
            "Success with ssml_text parameter",
            "ssml_text attempt failed",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with text parameter",
            "text attempt failed",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with minimal parameters",
            None,
            {"text": text, "speaker": speaker},
        ),
    )

    last_error = None
    for success_message, failure_prefix, kwargs in attempts:
        try:
            res = m.apply_tts(**kwargs)
        except Exception as exc:
            # Deliberately broad: any failure of one variant moves on to
            # the next one rather than aborting.
            last_error = exc
            if failure_prefix is not None:
                print(f"{failure_prefix}: {exc}")
            continue
        print(success_message)
        break
    else:
        print(f"All attempts failed. Last error: {last_error}")
        raise ValueError(
            f"Text processing failed. The model may not support this text. Error: {last_error}"
        ) from last_error

    # A tuple result carries the audio as its first element.
    return res[0] if isinstance(res, tuple) else res
|
|
|
|
|
|
|
|
def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """Synthesize ``text`` and write it to a temporary 16-bit WAV file.

    Args:
        text: Text to synthesize.
        speaker: Speaker voice to use.
        sample_rate: Sample rate passed to the model and written to the file.

    Returns:
        Path of the generated ``.wav`` file. The caller owns the file; it is
        not deleted here on success.

    Raises:
        RuntimeError: If neither scipy nor soundfile is available.
        ValueError: If synthesis fails or the model returns no samples.
    """
    # Check for a writer first so no model time is spent and no temp file
    # is created when there is nothing to write it with.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    audio = _call_apply_tts(text, speaker, sample_rate)
    if torch.is_tensor(audio):
        # detach() so a tensor that tracks gradients can still be converted.
        audio = audio.detach().cpu().numpy()
    pcm = _to_int16_pcm(audio)

    # mkstemp returns an open descriptor; close it so the writer can reopen
    # the file by path.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, pcm)
        else:
            sf.write(path, pcm, sample_rate)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        with contextlib.suppress(OSError):
            os.remove(path)
        raise
    return path


def _to_int16_pcm(audio):
    """Return ``audio`` as int16 samples, peak-normalizing if it exceeds [-1, 1]."""
    audio = np.asarray(audio)
    if audio.size == 0:
        raise ValueError("Model returned no audio samples")
    if audio.dtype == np.int16:
        return audio
    peak = np.abs(audio).max()
    if peak > 1.0:
        audio = audio / peak
    return (audio * 32767).astype(np.int16)
|
|
|
|
|
|
|
|
def tts_gradio_fn(text, speaker, sample_rate):
    """Gradio callback: validate the input and return a path to the audio.

    Args:
        text: Input text from the textbox.
        speaker: Selected speaker voice.
        sample_rate: Selected audio sample rate.

    Returns:
        Path to the generated audio file.

    Raises:
        gr.Error: If the text is empty or too long, or if synthesis fails;
            the message is what the UI shows to the user.
    """
    max_chars = 200

    # Measure the text as it will actually be synthesized, so padding
    # whitespace neither counts toward the limit nor passes as content.
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize")
    if len(cleaned) > max_chars:
        raise gr.Error(
            f"Text is too long. Please use shorter text ({max_chars} characters or fewer)"
        )

    try:
        return synthesize_text_to_wavfile(cleaned, speaker, sample_rate)
    except ValueError as e:
        raise gr.Error(
            f"Text processing failed: {e}. Try simpler text or a different language."
        ) from e
    except Exception as e:
        # Boundary handler: surface any other failure in the UI instead of
        # an unhandled traceback.
        raise gr.Error(f"Speech generation failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
# UI layout: inputs in the left column, generated audio in the right one.
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results",
            )

            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice",
            )

            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)",
            )

            submit_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # type="filepath": the callback hands back a path to a WAV file.
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
            )

    # Sample inputs, one row per (text, speaker, sample rate).
    # cache_examples=False, so nothing is synthesized ahead of time.
    gr.Examples(
        examples=[
            ["नमस्ते", "hindi_female", 48000],
            ["आप कैसे हैं", "hindi_male", 48000],
            ["হ্যালো", "bengali_female", 48000],
            ["வணக்கம்", "tamil_female", 48000],
            ["హలో", "telugu_female", 48000],
            ["ಹಲೋ", "kannada_female", 48000],
            ["હેલો", "gujarati_female", 48000],
        ],
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False,
    )

    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output,
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # 0.0.0.0 listens on every network interface, so the app is reachable
    # from outside the host (e.g. from within a container) on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_api=True,
    )