Spaces:

toshuu
/

speak

Runtime error

App Files Files Community

speak / app.py

toshuu

Update app.py

4f01cd3 verified about 1 month ago

raw

history blame contribute delete

7.61 kB

	import os
	import sys
	import tempfile
	import torch
	import gradio as gr
	from datetime import datetime
	import numpy as np

	# Pick a WAV writer backend: scipy is preferred, soundfile is the fallback.
	# Both flags are initialised up front so that each one is always defined,
	# whichever import succeeds or fails.
	USE_SCIPY = False
	USE_SOUNDFILE = False
	try:
	    import scipy.io.wavfile as wavfile
	    USE_SCIPY = True
	except ImportError:
	    try:
	        import soundfile as sf
	        USE_SOUNDFILE = True
	    except ImportError:
	        # Deliberately not fatal at import time: with both flags False,
	        # synthesize_text_to_wavfile raises a RuntimeError when called.
	        pass

	# Defaults used by the synthesis helpers and the UI, plus the model file.
	DEFAULT_SAMPLE_RATE = 48000
	DEFAULT_SPEAKER = "hindi_female"
	MODEL_PATH = "v4_indic.pt"

	# Startup banner with a local timestamp.
	print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

	# Unpickle the TTS model object from the torch.package archive.
	print(f"Loading model from {MODEL_PATH}")
	_package_importer = torch.package.PackageImporter(MODEL_PATH)
	m = _package_importer.load_pickle("tts_models", "model")
	print(f"Model object loaded: {type(m).__name__}")

	# Log the signature of apply_tts so the accepted parameters show up in the
	# startup output.
	import inspect
	sig = inspect.signature(m.apply_tts)
	print(f"apply_tts signature: {sig}")

	# Voice names accepted by the UI and by _call_apply_tts, in display order.
	# Each entry is "<language>_<gender>"; manipuri has a female voice only.
	AVAILABLE_SPEAKERS = [
	    "bengali_female",
	    "bengali_male",
	    "gujarati_female",
	    "gujarati_male",
	    "hindi_female",
	    "hindi_male",
	    "kannada_female",
	    "kannada_male",
	    "malayalam_female",
	    "malayalam_male",
	    "manipuri_female",
	    "rajasthani_female",
	    "rajasthani_male",
	    "tamil_female",
	    "tamil_male",
	    "telugu_female",
	    "telugu_male",
	]

	def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
	    """
	    Call the loaded model's ``apply_tts`` and return the audio it produces.

	    Three call shapes are tried in order until one succeeds:
	    ``ssml_text`` + speaker + sample_rate, then ``text`` + speaker +
	    sample_rate, and finally ``text`` + speaker only.

	    Args:
	        text: Text to synthesize. Zero-width joiner / non-joiner characters
	            and surrounding whitespace are removed before use.
	        speaker: Voice name; a value not in AVAILABLE_SPEAKERS is replaced
	            by DEFAULT_SPEAKER and a warning is printed.
	        sample_rate: Sample rate forwarded to the first two attempts.

	    Returns:
	        Whatever ``apply_tts`` returned, or its first element when the
	        result is a tuple.

	    Raises:
	        ValueError: If the text is empty after cleaning, or if every
	            attempt fails (chained to the last underlying error).
	    """
	    # Validate speaker
	    if speaker not in AVAILABLE_SPEAKERS:
	        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
	        speaker = DEFAULT_SPEAKER

	    # Remove zero-width characters *before* the emptiness check: str.strip()
	    # does not treat them as whitespace, so input made only of them would
	    # otherwise pass the check and reach the model as an empty string.
	    # `text or ""` turns None into the same "empty" error instead of an
	    # AttributeError.
	    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
	    if not text:
	        raise ValueError("Text cannot be empty")

	    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

	    # (success message, failure message prefix, keyword arguments), in the
	    # order the call shapes are attempted.
	    # NOTE(review): the last attempt omits sample_rate, so the model
	    # presumably falls back to its own default rate; a caller that later
	    # writes the audio using `sample_rate` may get a mismatch - confirm.
	    attempts = (
	        (
	            "Success with ssml_text parameter",
	            "ssml_text attempt failed",
	            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
	        ),
	        (
	            "Success with text parameter",
	            "text attempt failed",
	            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
	        ),
	        (
	            "Success with minimal parameters",
	            "All attempts failed. Last error",
	            {"text": text, "speaker": speaker},
	        ),
	    )

	    res = None
	    last_error = None
	    for success_message, failure_prefix, call_kwargs in attempts:
	        try:
	            res = m.apply_tts(**call_kwargs)
	        except Exception as exc:
	            # Broad on purpose: the exception types raised by the packaged
	            # model are not known here, and any failure should trigger the
	            # next call shape.
	            last_error = exc
	            print(f"{failure_prefix}: {exc}")
	        else:
	            print(success_message)
	            break
	    else:
	        # No attempt succeeded; keep the underlying error as the cause.
	        raise ValueError(
	            f"Text processing failed. The model may not support this text. Error: {last_error}"
	        ) from last_error

	    # Handle different return types
	    return res[0] if isinstance(res, tuple) else res


	def _to_int16_pcm(audio):
	    """Convert a tensor or array-like of audio samples to a NumPy int16 array."""
	    if torch.is_tensor(audio):
	        # detach() first: .numpy() refuses tensors that still track gradients.
	        audio = audio.detach().cpu().numpy()
	    audio = np.asarray(audio)

	    if audio.dtype == np.int16:
	        return audio
	    if audio.size == 0:
	        # Explicit error instead of NumPy's opaque zero-size reduction error.
	        raise ValueError("Model returned no audio samples")

	    # Scale down to the -1..1 range only when the signal exceeds it.
	    peak = np.abs(audio).max()
	    if peak > 1.0:
	        audio = audio / peak
	    # Convert to 16-bit PCM
	    return (audio * 32767).astype(np.int16)


	def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
	    """
	    Synthesize text to audio and save it to a temporary WAV file.

	    Args:
	        text: Text to synthesize
	        speaker: Speaker voice to use
	        sample_rate: Audio sample rate, passed to the model and written
	            into the WAV file

	    Returns:
	        Path to the generated WAV file. The file is not deleted here;
	        removing it is left to the caller.

	    Raises:
	        RuntimeError: If neither scipy nor soundfile is available.
	        ValueError: If synthesis fails or the model returns no samples.
	    """
	    # Fail fast, before running the model or creating any file.
	    if not (USE_SCIPY or USE_SOUNDFILE):
	        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

	    audio = _to_int16_pcm(_call_apply_tts(text, speaker, sample_rate))

	    # mkstemp returns an open descriptor; close it so the audio library can
	    # open the path itself.
	    fd, path = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)

	    try:
	        if USE_SCIPY:
	            wavfile.write(path, sample_rate, audio)
	        else:
	            # USE_SOUNDFILE is necessarily true here (checked above).
	            sf.write(path, audio, sample_rate)
	    except Exception:
	        # Do not leave an orphaned temp file behind when writing fails.
	        try:
	            os.remove(path)
	        except OSError:
	            pass
	        raise

	    return path


	def tts_gradio_fn(text, speaker, sample_rate):
	    """
	    Gradio callback: validate the input and synthesize it to a WAV file.

	    Args:
	        text: Input text
	        speaker: Selected speaker voice
	        sample_rate: Audio sample rate

	    Returns:
	        Path to the generated audio file

	    Raises:
	        gr.Error: If the text is empty or too long, or if synthesis fails.
	            Synthesis failures are chained to the original exception.
	    """
	    max_chars = 200

	    # Normalise once so that surrounding whitespace neither hides an empty
	    # input nor counts towards the length limit.
	    text = text.strip() if text else ""
	    if not text:
	        raise gr.Error("Please enter some text to synthesize")

	    # Reject (not merely warn about) over-long input.
	    if len(text) > max_chars:
	        raise gr.Error(
	            f"Text is too long. Please use shorter text (at most {max_chars} characters)"
	        )

	    try:
	        return synthesize_text_to_wavfile(text, speaker, sample_rate)
	    except ValueError as e:
	        raise gr.Error(
	            f"Text processing failed: {str(e)}. Try simpler text or a different language."
	        ) from e
	    except Exception as e:
	        # Top-level UI boundary: surface any other failure to the user.
	        raise gr.Error(f"Speech generation failed: {str(e)}") from e


	# Create Gradio interface: inputs in the left column, generated audio in the
	# right column, clickable examples underneath.
	with gr.Blocks(title="Silero v4 Indic TTS") as demo:
	    gr.Markdown("# Silero v4 Indic Text-to-Speech")
	    gr.Markdown("Convert text to speech in multiple Indian languages")
	    gr.Markdown("⚠️ Note: Use simple, short phrases for best results. Complex sentences may fail.")

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Enter Text",
	                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
	                lines=3,
	                info="Keep text short and simple for best results"
	            )

	            speaker_dropdown = gr.Dropdown(
	                choices=AVAILABLE_SPEAKERS,
	                value=DEFAULT_SPEAKER,
	                label="Select Speaker Voice"
	            )

	            # 16000 Hz is intentionally not offered: the Silero TTS
	            # documentation lists 8000, 24000 and 48000 as the supported
	            # rates. NOTE(review): confirm against this model file's
	            # apply_tts before re-adding other values.
	            sample_rate_dropdown = gr.Dropdown(
	                choices=[8000, 24000, 48000],
	                value=DEFAULT_SAMPLE_RATE,
	                label="Sample Rate (Hz)"
	            )

	            submit_btn = gr.Button("Generate Speech", variant="primary")

	        with gr.Column():
	            audio_output = gr.Audio(
	                label="Generated Audio",
	                type="filepath"
	            )

	    # Examples: each row is [text, speaker, sample rate]; results are not
	    # pre-computed (cache_examples=False).
	    gr.Examples(
	        examples=[
	            ["नमस्ते", "hindi_female", 48000],
	            ["आप कैसे हैं", "hindi_male", 48000],
	            ["হ্যালো", "bengali_female", 48000],
	            ["வணக்கம்", "tamil_female", 48000],
	            ["హలో", "telugu_female", 48000],
	            ["ಹಲೋ", "kannada_female", 48000],
	            ["હેલો", "gujarati_female", 48000],
	        ],
	        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
	        outputs=audio_output,
	        fn=tts_gradio_fn,
	        cache_examples=False
	    )

	    # Wire the button to the synthesis callback.
	    submit_btn.click(
	        fn=tts_gradio_fn,
	        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
	        outputs=audio_output
	    )

	# Start the Gradio server only when this file is run as a script.
	if __name__ == "__main__":
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "show_api": True,  # This enables the API documentation
	    }
	    demo.launch(**launch_options)