| """ | |
| Gradio app for Aawaz Hindi TTS Playground with Voice Cloning | |
| Host this on Hugging Face Spaces: ashishkblink/awaz | |
| """ | |
import gradio as gr
import torch

# Try to import spaces (only available on Hugging Face Spaces)
try:
    import spaces
    ON_SPACES = True
except ImportError:
    ON_SPACES = False

    # Create a dummy no-op decorator for local use
    class Spaces:
        @staticmethod
        def GPU(func):
            return func

    spaces = Spaces()

from transformers import VitsModel, VitsTokenizer, AutoModel, AutoTokenizer
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import librosa
# Model configuration
MODEL_ID = "ashishkblink/Aawaz"  # Your model repository
FALLBACK_MODEL = "facebook/mms-tts-hin"  # Fallback if custom model fails

# Load models (will be loaded on first use)
model = None
tokenizer = None
voice_clone_model = None  # For voice cloning (TTS library)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Check if TTS is available for voice cloning
try:
    from TTS.api import TTS
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
def load_model():
    """Load the standard TTS model."""
    global model, tokenizer
    if model is not None:
        return model, tokenizer

    print(f"Loading model: {MODEL_ID}...")
    try:
        # Try loading from your repository first
        try:
            model = VitsModel.from_pretrained(MODEL_ID)
            tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
            print(f"✅ Loaded model from {MODEL_ID}")
        except Exception as e:
            print(f"Could not load from {MODEL_ID}: {e}")
            print(f"Trying fallback: {FALLBACK_MODEL}")
            try:
                model = VitsModel.from_pretrained(FALLBACK_MODEL)
                tokenizer = VitsTokenizer.from_pretrained(FALLBACK_MODEL)
                print(f"✅ Loaded fallback model: {FALLBACK_MODEL}")
            except Exception as e2:
                # Try AutoModel as last resort
                print(f"Trying AutoModel... ({e2})")
                model = AutoModel.from_pretrained(FALLBACK_MODEL)
                tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
                print("✅ Loaded using AutoModel")

        model = model.to(device)
        model.eval()
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
def synthesize(text, speed=1.0):
    """
    Synthesize speech from text using standard TTS.

    Args:
        text: Input text (Hindi recommended, English may also work)
        speed: Speed multiplier (not all models support this; currently unused)
    """
    if not text or not text.strip():
        return None, "Please enter some text (Hindi recommended)."
    try:
        # Load model if not already loaded
        model, tokenizer = load_model()

        # Tokenize input
        inputs = tokenizer(text, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate speech
        with torch.no_grad():
            try:
                outputs = model(**inputs)

                # Extract audio - handle different output formats
                if hasattr(outputs, "waveform"):
                    audio = outputs.waveform
                elif hasattr(outputs, "audio"):
                    audio = outputs.audio
                elif isinstance(outputs, tuple) and len(outputs) > 0:
                    audio = outputs[0]
                else:
                    # Try generate method
                    if hasattr(model, "generate"):
                        audio = model.generate(**inputs)
                    else:
                        audio = outputs.last_hidden_state  # Fallback

                # Convert to numpy
                if isinstance(audio, torch.Tensor):
                    audio = audio.squeeze().cpu().numpy()
                else:
                    audio = np.array(audio).squeeze()

                # Normalize audio
                if audio.max() > 1.0 or audio.min() < -1.0:
                    audio = audio / (np.abs(audio).max() + 1e-8) * 0.95

                # Sample rate (default for VITS is usually 22050 or 16000)
                sample_rate = getattr(model.config, "sampling_rate", 22050)

                return (sample_rate, audio), None
            except Exception as e:
                error_msg = f"Error during synthesis: {str(e)}"
                print(error_msg)
                return None, error_msg
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        return None, error_msg
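# Minimal local usage sketch for synthesize() (assumes the model loads successfully):
#   (sr, audio), err = synthesize("नमस्ते, मैं आवाज़ हूँ।")
#   if err is None:
#       sf.write("sample.wav", audio, sr)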
def clone_voice(reference_audio, text, language="hi"):
    """
    Clone voice from reference audio and synthesize speech.

    Note: Voice cloning works locally when the TTS library is installed.
    On Hugging Face Spaces, this feature is disabled due to dependency size limits.
    """
    if not text or not text.strip():
        return None, "Please enter some Hindi text."
    if reference_audio is None:
        return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."

    # Try to use TTS library if available (local use)
    try:
        from TTS.api import TTS
    except ImportError:
        error_msg = (
            "🎭 Voice cloning requires the TTS library.\n\n"
            "**Install TTS:** `pip install TTS`\n\n"
            "**Note:** Voice cloning is not available on Hugging Face Spaces due to build limits.\n"
            "This feature works when running locally with TTS installed."
        )
        return None, error_msg

    try:
        # Load voice cloning model (cached after first load)
        global voice_clone_model
        if voice_clone_model is None:
            print("Loading XTTS-v2 voice cloning model (first time may take 2-3 minutes)...")
            voice_clone_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
            print("✅ Voice cloning model loaded")

        # Handle audio input (Gradio returns a tuple of (sample_rate, audio_array))
        if isinstance(reference_audio, tuple):
            sample_rate, audio_array = reference_audio
        else:
            # File path
            audio_array, sample_rate = sf.read(reference_audio)

        # Preprocess audio for better voice cloning results
        # Convert to mono if stereo
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # Resample to 22050 Hz (XTTS works best at this sample rate)
        target_sr = 22050
        if sample_rate != target_sr:
            audio_array = librosa.resample(audio_array.astype(np.float32), orig_sr=sample_rate, target_sr=target_sr)
            sample_rate = target_sr

        # Normalize audio to prevent clipping
        max_val = np.abs(audio_array).max()
        if max_val > 0:
            audio_array = audio_array / max_val * 0.95

        # Ensure audio is not too short (at least 1 second) or too long (max 15 seconds)
        min_duration = 1.0
        max_duration = 15.0
        duration = len(audio_array) / sample_rate
        if duration < min_duration:
            # Pad with silence
            padding_samples = int((min_duration - duration) * sample_rate)
            audio_array = np.pad(audio_array, (0, padding_samples), mode='constant')
        elif duration > max_duration:
            # Trim to max duration
            max_samples = int(max_duration * sample_rate)
            audio_array = audio_array[:max_samples]

        # Save preprocessed reference audio to a temp file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_ref:
            sf.write(tmp_ref.name, audio_array, sample_rate)
            ref_path = tmp_ref.name

        out_path = None
        try:
            # Generate output path
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_out:
                out_path = tmp_out.name

            # Synthesize with voice cloning
            print(f"Synthesizing with voice cloning: {text[:50]}...")
            voice_clone_model.tts_to_file(
                text=text,
                file_path=out_path,
                speaker_wav=ref_path,
                language=language
            )

            # Load and return generated audio
            audio, sr = sf.read(out_path)
            return (sr, audio), None
        finally:
            # Cleanup temp files
            try:
                if os.path.exists(ref_path):
                    os.unlink(ref_path)
                if out_path and os.path.exists(out_path):
                    os.unlink(out_path)
            except OSError:
                pass
    except Exception as e:
        error_msg = f"Error during voice cloning: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return None, error_msg
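# Minimal local usage sketch for clone_voice() (assumes the Coqui TTS library is
# installed and "reference.wav" is a hypothetical 3-10 second recording of the
# target speaker):
#   ref, sr = sf.read("reference.wav")
#   (out_sr, out_audio), err = clone_voice((sr, ref), "नमस्ते, यह मेरी क्लोन की हुई आवाज़ है।")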
# Gradio interface
def create_interface():
    """Create Gradio interface with tabs for standard TTS and voice cloning."""
    # Example texts in Hindi
    examples = [
        "नमस्ते, मैं आवाज़ हूँ।",
        "यह एक हिंदी टेक्स्ट-टू-स्पीच मॉडल है।",
        "आप कैसे हैं?",
        "मुझे हिंदी बोलना बहुत पसंद है।",
        "यह प्रौद्योगिकी अद्भुत है।"
    ]

    with gr.Blocks(title="Aawaz - Hindi TTS Playground") as demo:
        gr.Markdown("""
        # 🎙️ Aawaz - Hindi Text-to-Speech Playground

        Fine-tuned Hindi TTS model with high-quality speech synthesis.

        **✅ Recommended: Standard Hindi TTS**
        - Uses your fine-tuned Hindi TTS model
        - High-quality, accurate Hindi speech
        - Fast and reliable

        **⚠️ Experimental: Voice Cloning**
        - Uses XTTS-v2 (limited Hindi support)
        - Results may vary
        """)

        with gr.Tabs():
            # Standard TTS Tab
            with gr.Tab("🎤 Standard TTS (Recommended)"):
                gr.Markdown("""
                ### ✅ Generate High-Quality Hindi Speech
                This uses your fine-tuned Hindi TTS model for accurate, natural-sounding speech.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.Textbox(
                            label="Hindi Text",
                            placeholder="नमस्ते, यहाँ अपना हिंदी पाठ लिखें...",
                            lines=5,
                            value=examples[0]
                        )
                        generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="Generated Speech", type="numpy")
                        error_output = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### Example Texts")
                gr.Examples(
                    examples=examples,
                    inputs=text_input,
                    label="Click to use example"
                )

                generate_btn.click(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )
                text_input.submit(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )

            # Voice Cloning Tab - disabled due to poor Hindi support
            with gr.Tab("🎭 Voice Cloning (Not Recommended)"):
                gr.Markdown("""
                ### ⚠️ Voice Cloning Not Recommended for Hindi

                **XTTS-v2 voice cloning does not work well for Hindi** and produces unclear, inaccurate results.

                **✅ Please use the "Standard TTS" tab instead:**
                - High-quality Hindi speech synthesis
                - Uses your fine-tuned model
                - Accurate and reliable
                - Fast generation

                Voice cloning with XTTS-v2 is disabled for Hindi due to poor quality.
                """)
                gr.Textbox(
                    label="Status",
                    value="Voice cloning is not recommended for Hindi. Please use the Standard TTS tab for best results.",
                    interactive=False
                )

                # The old voice-cloning UI is kept below but disabled (`if False`)
                # in case you want to re-enable it and try anyway.
                if False and TTS_AVAILABLE:
                    # Show the actual voice cloning interface when TTS is available
                    gr.Markdown("""
                    ### Clone Your Voice! 🎭

                    **⚠️ Important Notice:**
                    XTTS-v2 voice cloning has **limited accuracy for Hindi** and may produce unclear or inaccurate results.

                    **✅ For Best Hindi TTS Results:**
                    - Use the **"Standard TTS"** tab instead (top tab)
                    - The Standard TTS uses your fine-tuned Hindi model
                    - It produces high-quality, accurate Hindi speech

                    **If you still want to try voice cloning:**
                    1. Record or upload a 5-10 second clear audio sample
                    2. Enter Hindi text to synthesize
                    3. Click "Clone Voice & Generate"

                    **Note:** Results may vary significantly. For production use, we recommend the Standard TTS tab.
                    """)

                    with gr.Row():
                        with gr.Column(scale=1):
                            reference_audio = gr.Audio(
                                label="Reference Voice (Record or Upload)",
                                type="numpy",
                                sources=["microphone", "upload"]
                            )
                            gr.Markdown("**💡 Tip:** Record 3-10 seconds of clear speech")
                        with gr.Column(scale=2):
                            clone_text_input = gr.Textbox(
                                label="Hindi Text to Synthesize",
                                placeholder="नमस्ते, मैं आवाज़ हूँ...",
                                lines=5,
                                value="नमस्ते, यह मेरी क्लोन की हुई आवाज़ है।"
                            )
                            clone_btn = gr.Button("🎭 Clone Voice & Generate", variant="primary", size="lg")
                            clone_audio_output = gr.Audio(label="Cloned Voice Speech", type="numpy")
                            clone_error_output = gr.Textbox(label="Status", interactive=False)

                    gr.Examples(
                        examples=examples,
                        inputs=clone_text_input,
                        label="Example texts for voice cloning"
                    )

                    clone_btn.click(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                    clone_text_input.submit(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                else:
                    # Show instructions when TTS is not available
                    gr.Markdown("""
                    ### Clone Your Voice! (Local Only)

                    **⚠️ Note:** Voice cloning requires the TTS library.

                    **🚀 To use voice cloning:**
                    1. **Install the TTS library:**
                    ```bash
                    pip install TTS
                    ```
                    2. **Restart the app:**
                    ```bash
                    python app.py
                    ```
                    """)
                    clone_error_output = gr.Textbox(
                        label="ℹ️ Information",
                        value="TTS library is not installed. Install with: pip install TTS",
                        interactive=False,
                        lines=5
                    )

            # API Documentation Tab
            with gr.Tab("🔌 API Usage"):
                gr.Markdown("""
                ## 🔌 Using the API
                You can use this model programmatically via the Hugging Face Inference API.
                """)

                gr.Markdown("""
                ### 📍 API Endpoint
                ```
                https://router.huggingface.co/models/ashishkblink/Aawaz
                ```

                **Method:** `POST`
                **Authentication:** Required (Hugging Face token) - Model is private

                **🔑 Get your token:** https://huggingface.co/settings/tokens

                **⚠️ Note:** This model is private, so authentication is required. Make sure your token has access to `ashishkblink/Aawaz`.
                """)

                with gr.Accordion("🐍 Python Example", open=True):
                    gr.Markdown("""
                    **Using huggingface_hub (Recommended):**
                    ```python
                    from huggingface_hub import InferenceClient

                    client = InferenceClient(
                        model="ashishkblink/Aawaz",
                        token="YOUR_HF_TOKEN"  # Get at https://huggingface.co/settings/tokens
                    )

                    audio = client.text_to_speech("नमस्ते, मैं आवाज़ हूँ।")

                    # Save to file
                    with open("output.wav", "wb") as f:
                        f.write(audio)
                    ```

                    **Using requests:**
                    ```python
                    import requests

                    url = "https://router.huggingface.co/models/ashishkblink/Aawaz"
                    headers = {
                        "Authorization": "Bearer YOUR_HF_TOKEN",
                        "Content-Type": "application/json"
                    }
                    data = {"inputs": "नमस्ते, मैं आवाज़ हूँ।"}

                    response = requests.post(url, headers=headers, json=data)
                    with open("output.wav", "wb") as f:
                        f.write(response.content)
                    ```
                    """)

                with gr.Accordion("💻 cURL Example", open=False):
                    gr.Markdown("""
                    ```bash
                    curl https://router.huggingface.co/models/ashishkblink/Aawaz \\
                        -X POST \\
                        -H "Authorization: Bearer YOUR_HF_TOKEN" \\
                        -H "Content-Type: application/json" \\
                        -d '{"inputs": "नमस्ते, मैं आवाज़ हूँ।"}' \\
                        --output output.wav
                    ```
                    """)

                with gr.Accordion("🌐 JavaScript/CodePen Example", open=False):
                    gr.Markdown("""
                    ```javascript
                    fetch('https://router.huggingface.co/models/ashishkblink/Aawaz', {
                        method: 'POST',
                        headers: {
                            'Authorization': 'Bearer YOUR_HF_TOKEN',
                            'Content-Type': 'application/json'
                        },
                        body: JSON.stringify({
                            inputs: 'नमस्ते, मैं आवाज़ हूँ।'
                        })
                    })
                    .then(response => response.blob())
                    .then(blob => {
                        const url = URL.createObjectURL(blob);
                        const audio = new Audio(url);
                        audio.play();

                        // Or create a download link
                        const a = document.createElement('a');
                        a.href = url;
                        a.download = 'output.wav';
                        a.click();
                    })
                    .catch(error => console.error('Error:', error));
                    ```
                    """)

                with gr.Accordion("📋 Request/Response Details", open=False):
                    gr.Markdown("""
                    **Request Body:**
                    ```json
                    {
                        "inputs": "नमस्ते, मैं आवाज़ हूँ।"
                    }
                    ```

                    **Response:**
                    - **Format:** WAV audio file
                    - **Sample Rate:** 22050 Hz
                    - **Content-Type:** `audio/wav`

                    **Input Requirements:**
                    - Text must be in Hindi (Devanagari script)
                    - Max length: ~500 characters recommended
                    - Language: Hindi only (English not supported)

                    **Error Codes:**
                    - `200`: Success
                    - `401`: Invalid or missing token
                    - `503`: Model is loading (wait 10-30 seconds and retry)
                    - `429`: Rate limit exceeded
                    """)

                gr.Markdown("""
                ---
                **📚 For complete documentation, visit:**
                Model Repository: https://huggingface.co/ashishkblink/Aawaz
                """)

        gr.Markdown("""
        ---
        **Model Information:**
        - Standard TTS: `ashishkblink/Aawaz` ✅ Available on Spaces
        - Voice Cloning: XTTS-v2 (Coqui TTS) ⚠️ Available locally only
        """)

    return demo
if __name__ == "__main__":
    demo = create_interface()
    # Gradio 4.x doesn't support the theme parameter in launch()
    demo.launch(server_name="0.0.0.0", server_port=7860)
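# Running locally (assuming the dependencies noted at the top are installed):
#   python app.py
# then open http://localhost:7860 in a browser. On Hugging Face Spaces this
# same file is launched automatically as the app entry point.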