# Gradio demo app for Silero v4 Indic text-to-speech.
# (Web file-viewer residue that preceded the code -- size banner, per-line
# commit hashes and the line-number gutter -- is not part of the program.)
import os
import sys
import tempfile
import torch
import gradio as gr
from datetime import datetime
import numpy as np
# Optional WAV writers: record which of scipy / soundfile can be imported.
# The flags are read later to pick the library used to write the output file.
try:
    import scipy.io.wavfile as wavfile
except ImportError:
    USE_SCIPY = False
else:
    USE_SCIPY = True

try:
    import soundfile as sf
except ImportError:
    USE_SOUNDFILE = False
else:
    USE_SOUNDFILE = True
# Configuration: model package file and the defaults used by the UI/functions.
MODEL_PATH = "v4_indic.pt"
DEFAULT_SPEAKER = "hindi_female"
DEFAULT_SAMPLE_RATE = 48000

_startup_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"===== Application Startup at {_startup_stamp} =====")

# Load the model once, at import time, from the torch.package archive.
# NOTE: load_pickle unpickles objects stored in MODEL_PATH, so that file must
# come from a trusted source.
print(f"Loading model from {MODEL_PATH}")
_importer = torch.package.PackageImporter(MODEL_PATH)
m = _importer.load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Log the signature of apply_tts so the accepted keyword arguments are
# visible in the startup output.
import inspect
sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")
# Speaker voices offered in the UI and accepted by the speaker validation.
# Names follow "<language>_<gender>"; Manipuri is listed with a female voice only.
AVAILABLE_SPEAKERS = [
    "bengali_female",
    "bengali_male",
    "gujarati_female",
    "gujarati_male",
    "hindi_female",
    "hindi_male",
    "kannada_female",
    "kannada_male",
    "malayalam_female",
    "malayalam_male",
    "manipuri_female",
    "rajasthani_female",
    "rajasthani_male",
    "tamil_female",
    "tamil_male",
    "telugu_female",
    "telugu_male",
]
def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Call the model's apply_tts, trying several keyword combinations.

    The call is attempted with ``ssml_text``, then with ``text``, then with
    ``text`` and ``speaker`` only; the first attempt that does not raise wins.

    Args:
        text: Text to synthesize. Zero-width joiner/non-joiner characters are
            removed and surrounding whitespace is stripped before use.
        speaker: Speaker voice; anything not in AVAILABLE_SPEAKERS is replaced
            by DEFAULT_SPEAKER (a warning is printed).
        sample_rate: Sample rate forwarded to the model (not forwarded by the
            last fallback attempt).
    Returns:
        The value returned by apply_tts, or its first element if it is a tuple.
    Raises:
        ValueError: If the cleaned text is empty, or if every attempt fails.
    """
    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Remove zero-width characters BEFORE the emptiness check, so input made
    # only of them is rejected instead of reaching the model as "".
    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # (failure label, success message, keyword arguments), in the order tried.
    # NOTE(review): the last attempt omits sample_rate, so the audio may come
    # back at the model's own default rate while callers still treat it as
    # `sample_rate` -- confirm against the model's apply_tts defaults.
    attempts = (
        (
            "ssml_text",
            "Success with ssml_text parameter",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "text",
            "Success with text parameter",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "minimal",
            "Success with minimal parameters",
            {"text": text, "speaker": speaker},
        ),
    )
    final_index = len(attempts) - 1

    for index, (label, success_message, kwargs) in enumerate(attempts):
        try:
            res = m.apply_tts(**kwargs)
        except Exception as exc:
            if index == final_index:
                print(f"All attempts failed. Last error: {exc}")
                raise ValueError(
                    f"Text processing failed. The model may not support this text. Error: {exc}"
                ) from exc
            print(f"{label} attempt failed: {exc}")
        else:
            print(success_message)
            break

    # Handle different return types: some variants return a tuple whose first
    # element is the audio.
    return res[0] if isinstance(res, tuple) else res
def _to_int16_pcm(audio):
    """Convert model output (tensor or array-like) to a NumPy int16 array."""
    if torch.is_tensor(audio):
        # detach() so tensors that track gradients can still be converted.
        audio = audio.detach().cpu().numpy()
    else:
        audio = np.asarray(audio)

    if audio.size == 0:
        raise ValueError("Model returned empty audio")

    if audio.dtype == np.int16:
        return audio

    # Scale down only when samples fall outside [-1, 1]; quieter audio keeps
    # its original level.
    peak = np.abs(audio).max()
    if peak > 1.0:
        audio = audio / peak

    # Convert to 16-bit PCM
    return (audio * 32767).astype(np.int16)


def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save it to a temporary WAV file.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate; also written into the WAV file
    Returns:
        Path to the generated WAV file. The file is not deleted by this
        function; the caller owns it.
    Raises:
        RuntimeError: If neither scipy nor soundfile is available.
        ValueError: If the text is rejected or the model returns no samples.
    """
    # Check for a writer first so no model call or temp file is wasted.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    audio = _to_int16_pcm(_call_apply_tts(text, speaker, sample_rate))

    # Create temporary file; only the path is needed, so close the descriptor.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, audio)
        else:
            sf.write(path, audio, sample_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    return path
def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate
    Returns:
        Path to generated audio file
    Raises:
        gr.Error: If the text is empty or too long, or if synthesis fails.
    """
    # Validate the text as it will actually be synthesized: surrounding
    # whitespace is stripped downstream, so it should not count here either.
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize")

    # Reject (not merely warn about) text that is too long
    if len(cleaned) > 200:
        raise gr.Error("Text is too long. Please use shorter text (under 200 characters)")

    try:
        return synthesize_text_to_wavfile(cleaned, speaker, sample_rate)
    except ValueError as e:
        # Chain the original exception so the cause stays in the traceback.
        raise gr.Error(
            f"Text processing failed: {str(e)}. Try simpler text or a different language."
        ) from e
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {str(e)}") from e
# Build the Gradio UI: inputs on the left, generated audio on the right,
# example rows and the click handler underneath.
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results",
            )
            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice",
            )
            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)",
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # The same three inputs feed both the examples and the button handler.
    _tts_inputs = [text_input, speaker_dropdown, sample_rate_dropdown]

    # Examples: (text, speaker, sample rate) rows matching _tts_inputs.
    _example_rows = [
        ["नमस्ते", "hindi_female", 48000],
        ["आप कैसे हैं", "hindi_male", 48000],
        ["হ্যালো", "bengali_female", 48000],
        ["வணக்கம்", "tamil_female", 48000],
        ["హలో", "telugu_female", 48000],
        ["ಹಲೋ", "kannada_female", 48000],
        ["હેલો", "gujarati_female", 48000],
    ]
    gr.Examples(
        examples=_example_rows,
        inputs=_tts_inputs,
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False,
    )

    submit_btn.click(fn=tts_gradio_fn, inputs=_tts_inputs, outputs=audio_output)
# Start the server (with the API documentation enabled) when run as a script.
if __name__ == "__main__":
    _launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_api": True,  # This enables the API documentation
    }
    demo.launch(**_launch_options)