# awaz/app.py — Ashish Kumar
# Commit c2c851c: Add note that model is private and requires authentication
"""
Gradio app for Aawaz Hindi TTS Playground with Voice Cloning
Host this on Hugging Face Spaces: ashishkblink/awaz
"""
import gradio as gr
import torch

# Try to import spaces (only available on Hugging Face Spaces)
try:
    import spaces
    ON_SPACES = True
except ImportError:
    ON_SPACES = False

    # Create a dummy decorator for local use so @spaces.GPU is a no-op
    class Spaces:
        @staticmethod
        def GPU(func):
            return func

    spaces = Spaces()

from transformers import VitsModel, VitsTokenizer, AutoModel, AutoTokenizer
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import librosa  # was imported twice; duplicate removed

# Model configuration
MODEL_ID = "ashishkblink/Aawaz"  # Your model repository
FALLBACK_MODEL = "facebook/mms-tts-hin"  # Fallback if custom model fails

# Load models (populated lazily on first use by load_model()/clone_voice())
model = None
tokenizer = None
voice_clone_model = None  # For voice cloning (TTS library)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Check if TTS is available for voice cloning (Coqui TTS, local installs only)
try:
    from TTS.api import TTS
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
def load_model():
    """Load the standard TTS model, caching it in module globals.

    Tries the fine-tuned repository first, then the public fallback
    model with VITS classes, then AutoModel/AutoTokenizer as a last
    resort.

    Returns:
        (model, tokenizer): model moved to ``device`` and set to eval
        mode, plus its matching tokenizer.

    Raises:
        Exception: re-raised if every loading strategy fails.
    """
    global model, tokenizer
    # Cache hit only when BOTH halves loaded previously.
    if model is not None and tokenizer is not None:
        return model, tokenizer
    print(f"Loading model: {MODEL_ID}...")
    try:
        # Load into locals first and commit to the globals only after both
        # the model AND tokenizer succeed; previously a tokenizer failure
        # left the global `model` set, so later calls returned (model, None).
        try:
            loaded_model = VitsModel.from_pretrained(MODEL_ID)
            loaded_tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
            print(f"✅ Loaded model from {MODEL_ID}")
        except Exception as e:
            print(f"Could not load from {MODEL_ID}: {e}")
            print(f"Trying fallback: {FALLBACK_MODEL}")
            try:
                loaded_model = VitsModel.from_pretrained(FALLBACK_MODEL)
                loaded_tokenizer = VitsTokenizer.from_pretrained(FALLBACK_MODEL)
                print(f"✅ Loaded fallback model: {FALLBACK_MODEL}")
            except Exception:
                # Try AutoModel as last resort
                print("Trying AutoModel...")
                loaded_model = AutoModel.from_pretrained(FALLBACK_MODEL)
                loaded_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
                print("✅ Loaded using AutoModel")
        loaded_model = loaded_model.to(device)
        loaded_model.eval()
        model, tokenizer = loaded_model, loaded_tokenizer
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
@spaces.GPU
def synthesize(text, speed=1.0):
    """
    Synthesize speech from text using standard TTS.

    Args:
        text: Input text (Hindi recommended, English may also work)
        speed: Speed multiplier (not all models support this)

    Returns:
        ((sample_rate, audio_array), None) on success, or
        (None, error_message) on failure / empty input.
    """
    if not text or not text.strip():
        return None, "Please enter some text (Hindi recommended)."
    try:
        # Lazily load (and cache) the model + tokenizer.
        tts_model, tts_tokenizer = load_model()

        # Encode the text and move every tensor onto the active device.
        encoded = tts_tokenizer(text, return_tensors="pt")
        encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

        with torch.no_grad():
            try:
                outputs = tts_model(**encoded)

                # Pull the waveform out of whichever field this model exposes.
                if hasattr(outputs, "waveform"):
                    waveform = outputs.waveform
                elif hasattr(outputs, "audio"):
                    waveform = outputs.audio
                elif isinstance(outputs, tuple) and len(outputs) > 0:
                    waveform = outputs[0]
                elif hasattr(tts_model, "generate"):
                    # Some architectures only emit audio via generate().
                    waveform = tts_model.generate(**encoded)
                else:
                    waveform = outputs.last_hidden_state  # Fallback

                # Convert to a flat numpy array.
                if isinstance(waveform, torch.Tensor):
                    samples = waveform.squeeze().cpu().numpy()
                else:
                    samples = np.array(waveform).squeeze()

                # Rescale only when the signal exceeds the [-1, 1] range.
                if samples.max() > 1.0 or samples.min() < -1.0:
                    samples = samples / (np.abs(samples).max() + 1e-8) * 0.95

                # Sample rate (default for VITS is usually 22050 or 16000)
                rate = getattr(tts_model.config, "sampling_rate", 22050)
                return (rate, samples), None
            except Exception as e:
                error_msg = f"Error during synthesis: {str(e)}"
                print(error_msg)
                return None, error_msg
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        return None, error_msg
def clone_voice(reference_audio, text, language="hi"):
    """
    Clone voice from reference audio and synthesize speech.

    Args:
        reference_audio: Gradio audio input — either a
            (sample_rate, audio_array) tuple or a file path.
        text: Text to synthesize (Hindi recommended).
        language: XTTS language code (default "hi").

    Returns:
        ((sample_rate, audio), None) on success, or
        (None, error_message) on failure.

    Note: Voice cloning works locally when TTS library is installed.
    On Hugging Face Spaces, this feature is disabled due to dependency size limits.
    """
    if not text or not text.strip():
        return None, "Please enter some Hindi text."
    if reference_audio is None:
        return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."

    # Try to use TTS library if available (local use)
    try:
        from TTS.api import TTS
    except ImportError:
        error_msg = (
            "🎭 Voice cloning requires TTS library.\n\n"
            "**Install TTS:** `pip install TTS`\n\n"
            "**Note:** Voice cloning is not available on Hugging Face Spaces due to build limits.\n"
            "This feature works when running locally with TTS installed."
        )
        return None, error_msg

    try:
        # Load voice cloning model (cached in module global after first load)
        global voice_clone_model
        if voice_clone_model is None:
            print("Loading XTTS-v2 voice cloning model (first time may take 2-3 minutes)...")
            voice_clone_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
            print("✅ Voice cloning model loaded")

        # Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
        if isinstance(reference_audio, tuple):
            sample_rate, audio_array = reference_audio
        else:
            # File path
            audio_array, sample_rate = sf.read(reference_audio)

        # Work in float32 throughout so resampling/normalization behave the
        # same regardless of input dtype (e.g. int16 microphone captures);
        # previously only the resample branch converted.
        audio_array = np.asarray(audio_array, dtype=np.float32)

        # Convert to mono if stereo
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # Resample to 22050 Hz (XTTS works best at this sample rate)
        target_sr = 22050
        if sample_rate != target_sr:
            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sr)
            sample_rate = target_sr

        # Normalize audio to prevent clipping
        max_val = np.abs(audio_array).max()
        if max_val > 0:
            audio_array = audio_array / max_val * 0.95

        # Ensure audio is not too short (at least 1 second) or too long (max 15 seconds)
        min_duration = 1.0
        max_duration = 15.0
        duration = len(audio_array) / sample_rate
        if duration < min_duration:
            # Pad with silence
            padding_samples = int((min_duration - duration) * sample_rate)
            audio_array = np.pad(audio_array, (0, padding_samples), mode='constant')
        elif duration > max_duration:
            # Trim to max duration
            max_samples = int(max_duration * sample_rate)
            audio_array = audio_array[:max_samples]

        # Save preprocessed reference audio to temp file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_ref:
            sf.write(tmp_ref.name, audio_array, sample_rate)
            ref_path = tmp_ref.name

        # Defined up-front so the cleanup below never hits an unbound name
        # (previously a failure creating the output tempfile raised NameError
        # in `finally`, swallowed by a bare except, leaking ref_path).
        out_path = None
        try:
            # Generate output path
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_out:
                out_path = tmp_out.name

            # Synthesize with voice cloning
            print(f"Synthesizing with voice cloning: {text[:50]}...")
            voice_clone_model.tts_to_file(
                text=text,
                file_path=out_path,
                speaker_wav=ref_path,
                language=language
            )

            # Load and return generated audio
            audio, sr = sf.read(out_path)
            return (sr, audio), None
        finally:
            # Cleanup temp files; unlink each independently so one failure
            # doesn't leak the other file.
            for path in (ref_path, out_path):
                if path and os.path.exists(path):
                    try:
                        os.unlink(path)
                    except OSError:
                        pass
    except Exception as e:
        error_msg = f"Error during voice cloning: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return None, error_msg
# Gradio interface
def create_interface():
    """Create Gradio interface with tabs for standard TTS and voice cloning.

    Returns:
        gr.Blocks: the assembled demo, ready for .launch().
    """
    # Example texts in Hindi (shared by the TTS and cloning tabs)
    examples = [
        "नमस्ते, मैं आवाज़ हूँ।",
        "यह एक हिंदी टेक्स्ट-टू-स्पीच मॉडल है।",
        "आप कैसे हैं?",
        "मुझे हिंदी बोलना बहुत पसंद है।",
        "यह प्रौद्योगिकी अद्भुत है।"
    ]
    with gr.Blocks(title="Aawaz - Hindi TTS Playground") as demo:
        # Page header with guidance on which tab to use
        gr.Markdown("""
        # 🎙️ Aawaz - Hindi Text-to-Speech Playground
        Fine-tuned Hindi TTS model with high-quality speech synthesis.
        **✅ Recommended: Standard Hindi TTS**
        - Uses your fine-tuned Hindi TTS model
        - High-quality, accurate Hindi speech
        - Fast and reliable
        **⚠️ Experimental: Voice Cloning**
        - Uses XTTS-v2 (limited Hindi support)
        - Results may vary
        """)
        with gr.Tabs():
            # Standard TTS Tab — wires text input through synthesize()
            with gr.Tab("🎤 Standard TTS (Recommended)"):
                gr.Markdown("""
                ### ✅ Generate High-Quality Hindi Speech
                This uses your fine-tuned Hindi TTS model for accurate, natural-sounding speech.
                """)
                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.Textbox(
                            label="Hindi Text",
                            placeholder="नमस्ते, यहाँ अपना हिंदी पाठ लिखें...",
                            lines=5,
                            value=examples[0]
                        )
                        generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="Generated Speech", type="numpy")
                        error_output = gr.Textbox(label="Status", interactive=False)
                gr.Markdown("### Example Texts")
                gr.Examples(
                    examples=examples,
                    inputs=text_input,
                    label="Click to use example"
                )
                # Both the button and Enter-in-textbox trigger synthesis
                generate_btn.click(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )
                text_input.submit(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )
            # Voice Cloning Tab - DISABLED due to poor Hindi support
            with gr.Tab("🎭 Voice Cloning (Not Recommended)"):
                gr.Markdown("""
                ### ⚠️ Voice Cloning Not Recommended for Hindi
                **XTTS-v2 voice cloning does not work well for Hindi** and produces unclear, inaccurate results.
                **✅ Please use the "Standard TTS" tab instead:**
                - High-quality Hindi speech synthesis
                - Uses your fine-tuned model
                - Accurate and reliable
                - Fast generation
                Voice cloning with XTTS-v2 is disabled for Hindi due to poor quality.
                """)
                gr.Textbox(
                    label="Status",
                    value="Voice cloning is not recommended for Hindi. Please use the Standard TTS tab for best results.",
                    interactive=False
                )
                # Keep the old code (deliberately dead: `if False and ...`)
                # in case user wants to try anyway by flipping the condition.
                if False and TTS_AVAILABLE:
                    # Show actual voice cloning interface when TTS is available
                    gr.Markdown("""
                    ### Clone Your Voice! 🎭
                    **⚠️ Important Notice:**
                    XTTS-v2 voice cloning has **limited accuracy for Hindi** and may produce unclear or inaccurate results.
                    **✅ For Best Hindi TTS Results:**
                    - Use the **"Standard TTS"** tab instead (top tab)
                    - The Standard TTS uses your fine-tuned Hindi model
                    - It produces high-quality, accurate Hindi speech
                    **If you still want to try voice cloning:**
                    1. Record or upload a 5-10 second clear audio sample
                    2. Enter Hindi text to synthesize
                    3. Click "Clone Voice & Generate"
                    **Note:** Results may vary significantly. For production use, we recommend the Standard TTS tab.
                    """)
                    with gr.Row():
                        with gr.Column(scale=1):
                            reference_audio = gr.Audio(
                                label="Reference Voice (Record or Upload)",
                                type="numpy",
                                sources=["microphone", "upload"]
                            )
                            gr.Markdown("**💡 Tip:** Record 3-10 seconds of clear speech")
                        with gr.Column(scale=2):
                            clone_text_input = gr.Textbox(
                                label="Hindi Text to Synthesize",
                                placeholder="नमस्ते, मैं आवाज़ हूँ...",
                                lines=5,
                                value="नमस्ते, यह मेरी क्लोन की हुई आवाज़ है।"
                            )
                            clone_btn = gr.Button("🎭 Clone Voice & Generate", variant="primary", size="lg")
                            clone_audio_output = gr.Audio(label="Cloned Voice Speech", type="numpy")
                            clone_error_output = gr.Textbox(label="Status", interactive=False)
                    gr.Examples(
                        examples=examples,
                        inputs=clone_text_input,
                        label="Example texts for voice cloning"
                    )
                    # Button click and textbox submit both run clone_voice()
                    clone_btn.click(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                    clone_text_input.submit(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                else:
                    # Show instructions when TTS is not available
                    gr.Markdown("""
                    ### Clone Your Voice! (Local Only)
                    **⚠️ Note:** Voice cloning requires TTS library.
                    **🚀 To use voice cloning:**
                    1. **Install TTS library:**
                    ```bash
                    pip install TTS
                    ```
                    2. **Restart the app:**
                    ```bash
                    python app.py
                    ```
                    """)
                    clone_error_output = gr.Textbox(
                        label="ℹ️ Information",
                        value="TTS library is not installed. Install with: pip install TTS",
                        interactive=False,
                        lines=5
                    )
            # API Documentation Tab — static docs only, no event handlers
            with gr.Tab("🔌 API Usage"):
                gr.Markdown("""
                ## 🔌 Using the API
                You can use this model programmatically via the Hugging Face Inference API.
                """)
                gr.Markdown("""
                ### 📍 API Endpoint
                ```
                https://router.huggingface.co/models/ashishkblink/Aawaz
                ```
                **Method:** `POST`
                **Authentication:** Required (Hugging Face token) - Model is private
                **🔑 Get your token:** https://huggingface.co/settings/tokens
                **⚠️ Note:** This model is private, so authentication is required. Make sure your token has access to `ashishkblink/Aawaz`.
                """)
                with gr.Accordion("🐍 Python Example", open=True):
                    gr.Markdown("""
                    **Using huggingface_hub (Recommended):**
                    ```python
                    from huggingface_hub import InferenceClient
                    client = InferenceClient(
                        model="ashishkblink/Aawaz",
                        token="YOUR_HF_TOKEN"  # Get at https://huggingface.co/settings/tokens
                    )
                    audio = client.text_to_speech("नमस्ते, मैं आवाज़ हूँ।")
                    # Save to file
                    with open("output.wav", "wb") as f:
                        f.write(audio)
                    ```
                    **Using requests:**
                    ```python
                    import requests
                    url = "https://router.huggingface.co/models/ashishkblink/Aawaz"
                    headers = {
                        "Authorization": "Bearer YOUR_HF_TOKEN",
                        "Content-Type": "application/json"
                    }
                    data = {"inputs": "नमस्ते, मैं आवाज़ हूँ।"}
                    response = requests.post(url, headers=headers, json=data)
                    with open("output.wav", "wb") as f:
                        f.write(response.content)
                    ```
                    """)
                with gr.Accordion("💻 cURL Example", open=False):
                    gr.Markdown("""
                    ```bash
                    curl https://router.huggingface.co/models/ashishkblink/Aawaz \\
                        -X POST \\
                        -H "Authorization: Bearer YOUR_HF_TOKEN" \\
                        -H "Content-Type: application/json" \\
                        -d '{"inputs": "नमस्ते, मैं आवाज़ हूँ।"}' \\
                        --output output.wav
                    ```
                    """)
                with gr.Accordion("🌐 JavaScript/CodePen Example", open=False):
                    gr.Markdown("""
                    ```javascript
                    fetch('https://router.huggingface.co/models/ashishkblink/Aawaz', {
                        method: 'POST',
                        headers: {
                            'Authorization': 'Bearer YOUR_HF_TOKEN',
                            'Content-Type': 'application/json'
                        },
                        body: JSON.stringify({
                            inputs: 'नमस्ते, मैं आवाज़ हूँ।'
                        })
                    })
                    .then(response => response.blob())
                    .then(blob => {
                        const url = URL.createObjectURL(blob);
                        const audio = new Audio(url);
                        audio.play();
                        // Or create download link
                        const a = document.createElement('a');
                        a.href = url;
                        a.download = 'output.wav';
                        a.click();
                    })
                    .catch(error => console.error('Error:', error));
                    ```
                    """)
                with gr.Accordion("📋 Request/Response Details", open=False):
                    gr.Markdown("""
                    **Request Body:**
                    ```json
                    {
                        "inputs": "नमस्ते, मैं आवाज़ हूँ।"
                    }
                    ```
                    **Response:**
                    - **Format:** WAV audio file
                    - **Sample Rate:** 22050 Hz
                    - **Content-Type:** `audio/wav`
                    **Input Requirements:**
                    - Text must be in Hindi (Devanagari script)
                    - Max length: ~500 characters recommended
                    - Language: Hindi only (English not supported)
                    **Error Codes:**
                    - `200`: Success
                    - `401`: Invalid or missing token
                    - `503`: Model is loading (wait 10-30 seconds and retry)
                    - `429`: Rate limit exceeded
                    """)
                gr.Markdown("""
                ---
                **📚 For complete documentation, visit:**
                Model Repository: https://huggingface.co/ashishkblink/Aawaz
                """)
        # Footer shown below the tab group
        gr.Markdown("""
        ---
        **Model Information:**
        - Standard TTS: `ashishkblink/Aawaz` ✅ Available on Spaces
        - Voice Cloning: XTTS-v2 (Coqui TTS) ⚠️ Available locally only
        """)
    return demo
if __name__ == "__main__":
    app = create_interface()
    # Note: Gradio 4.x doesn't accept a theme parameter in launch()
    app.launch(server_name="0.0.0.0", server_port=7860)