| """ | |
| Gradio app for Aawaz Hindi TTS Playground with Voice Cloning | |
| Host this on Hugging Face Spaces: ashishkblink/awaz | |
| """ | |
import gradio as gr
import torch

# Try to import spaces (only available on Hugging Face Spaces)
try:
    import spaces
    ON_SPACES = True
except ImportError:
    ON_SPACES = False

    # Create a dummy no-op decorator for local use
    class Spaces:
        @staticmethod
        def GPU(func):
            return func

    spaces = Spaces()

from transformers import VitsModel, VitsTokenizer, AutoModel, AutoTokenizer
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import librosa
# Model configuration
MODEL_ID = "ashishkblink/Aawaz"  # Your model repository
FALLBACK_MODEL = "facebook/mms-tts-hin"  # Fallback if custom model fails

# Load models (will be loaded on first use)
model = None
tokenizer = None
voice_clone_model = None  # For voice cloning (TTS library)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Check if TTS is available for voice cloning
try:
    from TTS.api import TTS
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
def load_model():
    """Load the standard TTS model."""
    global model, tokenizer
    if model is not None:
        return model, tokenizer

    print(f"Loading model: {MODEL_ID}...")
    try:
        # Try loading from your repository first
        try:
            model = VitsModel.from_pretrained(MODEL_ID)
            tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
            print(f"✅ Loaded model from {MODEL_ID}")
        except Exception as e:
            print(f"Could not load from {MODEL_ID}: {e}")
            print(f"Trying fallback: {FALLBACK_MODEL}")
            try:
                model = VitsModel.from_pretrained(FALLBACK_MODEL)
                tokenizer = VitsTokenizer.from_pretrained(FALLBACK_MODEL)
                print(f"✅ Loaded fallback model: {FALLBACK_MODEL}")
            except Exception as e2:
                # Try AutoModel as last resort
                print(f"Trying AutoModel... ({e2})")
                model = AutoModel.from_pretrained(FALLBACK_MODEL)
                tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
                print("✅ Loaded using AutoModel")

        model = model.to(device)
        model.eval()
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
def synthesize(text, speed=1.0):
    """
    Synthesize speech from text using standard TTS.

    Args:
        text: Input text (Hindi recommended, English may also work)
        speed: Speed multiplier (not all models support this; currently unused)
    """
    if not text or not text.strip():
        return None, "Please enter some text (Hindi recommended)."
    try:
        # Load model if not already loaded
        model, tokenizer = load_model()

        # Tokenize input
        inputs = tokenizer(text, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate speech
        with torch.no_grad():
            try:
                outputs = model(**inputs)

                # Extract audio - handle different output formats
                if hasattr(outputs, "waveform"):
                    audio = outputs.waveform
                elif hasattr(outputs, "audio"):
                    audio = outputs.audio
                elif isinstance(outputs, tuple) and len(outputs) > 0:
                    audio = outputs[0]
                else:
                    # Try generate method
                    if hasattr(model, "generate"):
                        audio = model.generate(**inputs)
                    else:
                        audio = outputs.last_hidden_state  # Fallback

                # Convert to numpy
                if isinstance(audio, torch.Tensor):
                    audio = audio.squeeze().cpu().numpy()
                else:
                    audio = np.array(audio).squeeze()

                # Normalize audio
                if audio.max() > 1.0 or audio.min() < -1.0:
                    audio = audio / (np.abs(audio).max() + 1e-8) * 0.95

                # Sample rate (default for VITS is usually 22050 or 16000)
                sample_rate = getattr(model.config, "sampling_rate", 22050)

                return (sample_rate, audio), None
            except Exception as e:
                error_msg = f"Error during synthesis: {str(e)}"
                print(error_msg)
                return None, error_msg
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        return None, error_msg
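# Minimal local usage sketch for synthesize() (assumes the model loads successfully):
#   (sr, audio), err = synthesize("नमस्ते, मैं आवाज़ हूँ।")
#   if err is None:
#       sf.write("sample.wav", audio, sr)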
def clone_voice(reference_audio, text, language="hi"):
    """
    Clone voice from reference audio and synthesize speech.

    Note: Voice cloning works locally when the TTS library is installed.
    On Hugging Face Spaces, this feature is disabled due to dependency size limits.
    """
    if not text or not text.strip():
        return None, "Please enter some Hindi text."
    if reference_audio is None:
        return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."

    # Try to use TTS library if available (local use)
    try:
        from TTS.api import TTS
    except ImportError:
        error_msg = (
            "🎭 Voice cloning requires the TTS library.\n\n"
            "**Install TTS:** `pip install TTS`\n\n"
            "**Note:** Voice cloning is not available on Hugging Face Spaces due to build limits.\n"
            "This feature works when running locally with TTS installed."
        )
        return None, error_msg

    try:
        # Load voice cloning model (cached after first load)
        global voice_clone_model
        if voice_clone_model is None:
            print("Loading XTTS-v2 voice cloning model (first time may take 2-3 minutes)...")
            voice_clone_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
            print("✅ Voice cloning model loaded")

        # Handle audio input (Gradio returns a tuple of (sample_rate, audio_array))
        if isinstance(reference_audio, tuple):
            sample_rate, audio_array = reference_audio
        else:
            # File path
            audio_array, sample_rate = sf.read(reference_audio)

        # Preprocess audio for better voice cloning results
        # Convert to mono if stereo
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # Resample to 22050 Hz (XTTS works best at this sample rate)
        target_sr = 22050
        if sample_rate != target_sr:
            audio_array = librosa.resample(audio_array.astype(np.float32), orig_sr=sample_rate, target_sr=target_sr)
            sample_rate = target_sr

        # Normalize audio to prevent clipping
        max_val = np.abs(audio_array).max()
        if max_val > 0:
            audio_array = audio_array / max_val * 0.95

        # Ensure audio is not too short (at least 1 second) or too long (max 15 seconds)
        min_duration = 1.0
        max_duration = 15.0
        duration = len(audio_array) / sample_rate
        if duration < min_duration:
            # Pad with silence
            padding_samples = int((min_duration - duration) * sample_rate)
            audio_array = np.pad(audio_array, (0, padding_samples), mode='constant')
        elif duration > max_duration:
            # Trim to max duration
            max_samples = int(max_duration * sample_rate)
            audio_array = audio_array[:max_samples]

        # Save preprocessed reference audio to a temp file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_ref:
            sf.write(tmp_ref.name, audio_array, sample_rate)
            ref_path = tmp_ref.name

        out_path = None
        try:
            # Generate output path
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_out:
                out_path = tmp_out.name

            # Synthesize with voice cloning
            print(f"Synthesizing with voice cloning: {text[:50]}...")
            voice_clone_model.tts_to_file(
                text=text,
                file_path=out_path,
                speaker_wav=ref_path,
                language=language
            )

            # Load and return generated audio
            audio, sr = sf.read(out_path)
            return (sr, audio), None
        finally:
            # Cleanup temp files
            try:
                if os.path.exists(ref_path):
                    os.unlink(ref_path)
                if out_path and os.path.exists(out_path):
                    os.unlink(out_path)
            except OSError:
                pass
    except Exception as e:
        error_msg = f"Error during voice cloning: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return None, error_msg
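# Minimal local usage sketch for clone_voice() (assumes the Coqui TTS library is
# installed and "reference.wav" is a hypothetical 3-10 second recording of the
# target speaker):
#   ref, sr = sf.read("reference.wav")
#   (out_sr, out_audio), err = clone_voice((sr, ref), "नमस्ते, यह मेरी क्लोन की हुई आवाज़ है।")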
# Gradio interface
def create_interface():
    """Create Gradio interface with tabs for standard TTS and voice cloning."""
    # Example texts in Hindi
    examples = [
        "नमस्ते, मैं आवाज़ हूँ।",
        "यह एक हिंदी टेक्स्ट-टू-स्पीच मॉडल है।",
        "आप कैसे हैं?",
        "मुझे हिंदी बोलना बहुत पसंद है।",
        "यह प्रौद्योगिकी अद्भुत है।"
    ]

    with gr.Blocks(title="Aawaz - Hindi TTS Playground") as demo:
        gr.Markdown("""
        # 🎙️ Aawaz - Hindi Text-to-Speech Playground

        Fine-tuned Hindi TTS model with high-quality speech synthesis.

        **✅ Recommended: Standard Hindi TTS**
        - Uses your fine-tuned Hindi TTS model
        - High-quality, accurate Hindi speech
        - Fast and reliable

        **⚠️ Experimental: Voice Cloning**
        - Uses XTTS-v2 (limited Hindi support)
        - Results may vary
        """)

        with gr.Tabs():
            # Standard TTS Tab
            with gr.Tab("🎤 Standard TTS (Recommended)"):
                gr.Markdown("""
                ### ✅ Generate High-Quality Hindi Speech
                This uses your fine-tuned Hindi TTS model for accurate, natural-sounding speech.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.Textbox(
                            label="Hindi Text",
                            placeholder="नमस्ते, यहाँ अपना हिंदी पाठ लिखें...",
                            lines=5,
                            value=examples[0]
                        )
                        generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="Generated Speech", type="numpy")
                        error_output = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### Example Texts")
                gr.Examples(
                    examples=examples,
                    inputs=text_input,
                    label="Click to use example"
                )

                generate_btn.click(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )
                text_input.submit(
                    fn=synthesize,
                    inputs=[text_input],
                    outputs=[audio_output, error_output]
                )

            # Voice Cloning Tab - disabled due to poor Hindi support
            with gr.Tab("🎭 Voice Cloning (Not Recommended)"):
                gr.Markdown("""
                ### ⚠️ Voice Cloning Not Recommended for Hindi

                **XTTS-v2 voice cloning does not work well for Hindi** and produces unclear, inaccurate results.

                **✅ Please use the "Standard TTS" tab instead:**
                - High-quality Hindi speech synthesis
                - Uses your fine-tuned model
                - Accurate and reliable
                - Fast generation

                Voice cloning with XTTS-v2 is disabled for Hindi due to poor quality.
                """)
                gr.Textbox(
                    label="Status",
                    value="Voice cloning is not recommended for Hindi. Please use the Standard TTS tab for best results.",
                    interactive=False
                )

                # The old voice-cloning UI is kept below but disabled (`if False`)
                # in case you want to re-enable it and try anyway.
                if False and TTS_AVAILABLE:
                    # Show the actual voice cloning interface when TTS is available
                    gr.Markdown("""
                    ### Clone Your Voice! 🎭

                    **⚠️ Important Notice:**
                    XTTS-v2 voice cloning has **limited accuracy for Hindi** and may produce unclear or inaccurate results.

                    **✅ For Best Hindi TTS Results:**
                    - Use the **"Standard TTS"** tab instead (top tab)
                    - The Standard TTS uses your fine-tuned Hindi model
                    - It produces high-quality, accurate Hindi speech

                    **If you still want to try voice cloning:**
                    1. Record or upload a 5-10 second clear audio sample
                    2. Enter Hindi text to synthesize
                    3. Click "Clone Voice & Generate"

                    **Note:** Results may vary significantly. For production use, we recommend the Standard TTS tab.
                    """)

                    with gr.Row():
                        with gr.Column(scale=1):
                            reference_audio = gr.Audio(
                                label="Reference Voice (Record or Upload)",
                                type="numpy",
                                sources=["microphone", "upload"]
                            )
                            gr.Markdown("**💡 Tip:** Record 3-10 seconds of clear speech")
                        with gr.Column(scale=2):
                            clone_text_input = gr.Textbox(
                                label="Hindi Text to Synthesize",
                                placeholder="नमस्ते, मैं आवाज़ हूँ...",
                                lines=5,
                                value="नमस्ते, यह मेरी क्लोन की हुई आवाज़ है।"
                            )
                            clone_btn = gr.Button("🎭 Clone Voice & Generate", variant="primary", size="lg")
                            clone_audio_output = gr.Audio(label="Cloned Voice Speech", type="numpy")
                            clone_error_output = gr.Textbox(label="Status", interactive=False)

                    gr.Examples(
                        examples=examples,
                        inputs=clone_text_input,
                        label="Example texts for voice cloning"
                    )

                    clone_btn.click(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                    clone_text_input.submit(
                        fn=clone_voice,
                        inputs=[reference_audio, clone_text_input],
                        outputs=[clone_audio_output, clone_error_output]
                    )
                else:
                    # Show instructions when TTS is not available
                    gr.Markdown("""
                    ### Clone Your Voice! (Local Only)

                    **⚠️ Note:** Voice cloning requires the TTS library.

                    **🚀 To use voice cloning:**
                    1. **Install the TTS library:**
                    ```bash
                    pip install TTS
                    ```
                    2. **Restart the app:**
                    ```bash
                    python app.py
                    ```
                    """)
                    clone_error_output = gr.Textbox(
                        label="ℹ️ Information",
                        value="TTS library is not installed. Install with: pip install TTS",
                        interactive=False,
                        lines=5
                    )

            # API Documentation Tab
            with gr.Tab("🔌 API Usage"):
                gr.Markdown("""
                ## 🔌 Using the API
                You can use this model programmatically via the Hugging Face Inference API.
                """)

                gr.Markdown("""
                ### 📍 API Endpoint
                ```
                https://router.huggingface.co/models/ashishkblink/Aawaz
                ```

                **Method:** `POST`
                **Authentication:** Required (Hugging Face token) - Model is private

                **🔑 Get your token:** https://huggingface.co/settings/tokens

                **⚠️ Note:** This model is private, so authentication is required. Make sure your token has access to `ashishkblink/Aawaz`.
                """)

                with gr.Accordion("🐍 Python Example", open=True):
                    gr.Markdown("""
                    **Using huggingface_hub (Recommended):**
                    ```python
                    from huggingface_hub import InferenceClient

                    client = InferenceClient(
                        model="ashishkblink/Aawaz",
                        token="YOUR_HF_TOKEN"  # Get at https://huggingface.co/settings/tokens
                    )

                    audio = client.text_to_speech("नमस्ते, मैं आवाज़ हूँ।")

                    # Save to file
                    with open("output.wav", "wb") as f:
                        f.write(audio)
                    ```

                    **Using requests:**
                    ```python
                    import requests

                    url = "https://router.huggingface.co/models/ashishkblink/Aawaz"
                    headers = {
                        "Authorization": "Bearer YOUR_HF_TOKEN",
                        "Content-Type": "application/json"
                    }
                    data = {"inputs": "नमस्ते, मैं आवाज़ हूँ।"}

                    response = requests.post(url, headers=headers, json=data)
                    with open("output.wav", "wb") as f:
                        f.write(response.content)
                    ```
                    """)

                with gr.Accordion("💻 cURL Example", open=False):
                    gr.Markdown("""
                    ```bash
                    curl https://router.huggingface.co/models/ashishkblink/Aawaz \\
                        -X POST \\
                        -H "Authorization: Bearer YOUR_HF_TOKEN" \\
                        -H "Content-Type: application/json" \\
                        -d '{"inputs": "नमस्ते, मैं आवाज़ हूँ।"}' \\
                        --output output.wav
                    ```
                    """)

                with gr.Accordion("🌐 JavaScript/CodePen Example", open=False):
                    gr.Markdown("""
                    ```javascript
                    fetch('https://router.huggingface.co/models/ashishkblink/Aawaz', {
                        method: 'POST',
                        headers: {
                            'Authorization': 'Bearer YOUR_HF_TOKEN',
                            'Content-Type': 'application/json'
                        },
                        body: JSON.stringify({
                            inputs: 'नमस्ते, मैं आवाज़ हूँ।'
                        })
                    })
                    .then(response => response.blob())
                    .then(blob => {
                        const url = URL.createObjectURL(blob);
                        const audio = new Audio(url);
                        audio.play();

                        // Or create a download link
                        const a = document.createElement('a');
                        a.href = url;
                        a.download = 'output.wav';
                        a.click();
                    })
                    .catch(error => console.error('Error:', error));
                    ```
                    """)

                with gr.Accordion("📋 Request/Response Details", open=False):
                    gr.Markdown("""
                    **Request Body:**
                    ```json
                    {
                        "inputs": "नमस्ते, मैं आवाज़ हूँ।"
                    }
                    ```

                    **Response:**
                    - **Format:** WAV audio file
                    - **Sample Rate:** 22050 Hz
                    - **Content-Type:** `audio/wav`

                    **Input Requirements:**
                    - Text must be in Hindi (Devanagari script)
                    - Max length: ~500 characters recommended
                    - Language: Hindi only (English not supported)

                    **Error Codes:**
                    - `200`: Success
                    - `401`: Invalid or missing token
                    - `503`: Model is loading (wait 10-30 seconds and retry)
                    - `429`: Rate limit exceeded
                    """)

                gr.Markdown("""
                ---
                **📚 For complete documentation, visit:**
                Model Repository: https://huggingface.co/ashishkblink/Aawaz
                """)

        gr.Markdown("""
        ---
        **Model Information:**
        - Standard TTS: `ashishkblink/Aawaz` ✅ Available on Spaces
        - Voice Cloning: XTTS-v2 (Coqui TTS) ⚠️ Available locally only
        """)

    return demo
if __name__ == "__main__":
    demo = create_interface()
    # Gradio 4.x doesn't support the theme parameter in launch()
    demo.launch(server_name="0.0.0.0", server_port=7860)
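# Running locally (assuming the dependencies noted at the top are installed):
#   python app.py
# then open http://localhost:7860 in a browser. On Hugging Face Spaces this
# same file is launched automatically as the app entry point.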