Spaces:
Sleeping
Sleeping
"""
Vakya 2.0 - Text-to-Speech Playground.

A Hugging Face Space for testing the Vakya TTS model.
"""
import os
import sys
import tempfile
import traceback
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download, snapshot_download
# Locate and import the f5_tts package.
#
# The package is expected to live next to app.py, but a few other parent
# directories are probed as well before falling back to an installed copy.
# Every failed attempt is recorded so the final ImportError can list them all.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Make sibling packages of app.py importable.
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Directories that may contain an "f5_tts" sub-directory.
possible_parent_paths = [
    current_dir,  # Same directory as app.py
    os.path.join(current_dir, ".."),  # Parent directory
    "/app",  # Common HF Spaces location
]

f5_tts_imported = False
import_error_details = []


def _forget_partial_f5_tts():
    """Drop any half-imported f5_tts modules from the import cache.

    When ``from f5_tts.api import ...`` fails part-way, the ``f5_tts`` package
    object stays in ``sys.modules``; without this purge the next candidate
    directory would silently reuse the broken package instead of its own copy.
    """
    stale = [
        name
        for name in sys.modules
        if name == "f5_tts" or name.startswith("f5_tts.")
    ]
    for name in stale:
        del sys.modules[name]


# Normalise and de-duplicate the candidates (current_dir and "/app" can be the
# same directory) while keeping the original probing order.
_candidate_parents = list(
    dict.fromkeys(os.path.abspath(path) for path in possible_parent_paths)
)

for parent_path in _candidate_parents:
    if not os.path.isdir(os.path.join(parent_path, "f5_tts")):
        continue

    # Add the parent directory to the path (not f5_tts itself).
    if parent_path not in sys.path:
        sys.path.insert(0, parent_path)

    try:
        from f5_tts.api import F5TTS
        from f5_tts.infer.utils_infer import preprocess_ref_audio_text
    except ImportError as e:
        error_msg = str(e)
        import_error_details.append(f"{parent_path}: {error_msg}")
        print(f"⚠️ Tried {parent_path}, but import failed: {error_msg}")
        _forget_partial_f5_tts()
    except Exception as e:
        # Module-level code inside f5_tts may raise anything; keep probing.
        error_msg = str(e)
        import_error_details.append(f"{parent_path}: {type(e).__name__}: {error_msg}")
        print(f"⚠️ Tried {parent_path}, but error occurred: {error_msg}")
        _forget_partial_f5_tts()
    else:
        f5_tts_imported = True
        print(f"✅ Successfully imported f5_tts from {parent_path}")
        break

if not f5_tts_imported:
    # Last resort: f5_tts may be installed as a regular package.
    try:
        from f5_tts.api import F5TTS
        from f5_tts.infer.utils_infer import preprocess_ref_audio_text
    except ImportError as e:
        # Record the failure so the final report never claims nothing was tried.
        import_error_details.append(f"installed package: {e}")
    else:
        f5_tts_imported = True
        print("✅ Successfully imported f5_tts (installed package)")

if not f5_tts_imported:
    # Print debug information before giving up.
    local_f5_tts = os.path.join(current_dir, "f5_tts")
    print(f"❌ Current directory: {current_dir}")
    print(f"❌ Python path: {sys.path[:5]}")
    print(f"❌ Checking for f5_tts in: {current_dir}")
    if os.path.exists(local_f5_tts):
        print(f" ✅ f5_tts directory exists at: {local_f5_tts}")
        print(f" 📁 Contents: {os.listdir(local_f5_tts)[:10]}")
    else:
        print(" ❌ f5_tts directory NOT found")

    error_summary = "\n".join(import_error_details)
    raise ImportError(
        f"Could not import f5_tts. Please ensure the model code is available.\n"
        f"Current directory: {current_dir}\n"
        f"Looking for f5_tts in: {current_dir}\n"
        f"Python path: {sys.path[:3]}\n"
        f"Import errors:\n{error_summary}\n"
        f"If you see 'No module named X', add it to requirements.txt"
    )
# Hugging Face Hub repository the model files are fetched from.
MODEL_REPO_ID = "ashishkblink/vakya2.0"

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Module-level model handles; load_model() assigns tts_model on first use.
# vocoder is not assigned anywhere else in the visible code.
tts_model = None
vocoder = None
def _get_hf_token():
    """Return the Hub access token from the environment.

    ``HF_TOKEN`` takes precedence over ``HUGGING_FACE_HUB_TOKEN``; the result
    is falsy when neither variable holds a value.
    """
    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")


def _download_first_available(filenames, token):
    """Return the local path of the first of *filenames* that downloads, else ""."""
    for filename in filenames:
        try:
            return hf_hub_download(
                repo_id=MODEL_REPO_ID,
                filename=filename,
                cache_dir=None,
                token=token,
            )
        except Exception as exc:
            # Best effort: a missing file is expected, try the next name.
            print(f"⚠️ Could not download {filename}: {exc}")
    return ""


def _format_auth_error():
    """Build the user-facing help text shown when the Hub rejects the request."""
    return (
        f"❌ Authentication Error: The model repository '{MODEL_REPO_ID}' is private or requires authentication.\n\n"
        f"**Solutions:**\n"
        f"1. **Make repository public** (Recommended for playground):\n"
        f" - Go to: https://huggingface.co/{MODEL_REPO_ID}/settings\n"
        f" - Change visibility to 'Public'\n\n"
        f"2. **Add authentication token to Space** (if keeping private):\n"
        f" - Go to Space Settings → Repository secrets\n"
        f" - Add secret: HF_TOKEN with your Hugging Face token\n"
        f" - Get token from: https://huggingface.co/settings/tokens\n\n"
        f"3. **Upload model files directly to Space** (alternative):\n"
        f" - Upload model checkpoint and vocab.txt to the Space repository\n"
        f" - The app will use local files instead"
    )


def load_model():
    """Load the Vakya model from Hugging Face into the global ``tts_model``.

    Downloads a snapshot of ``MODEL_REPO_ID``, picks a checkpoint
    (``*.safetensors`` preferred over ``*.pt``) and ``vocab.txt`` from it, and
    builds an ``F5TTS`` instance on ``DEVICE``.  Calling it again once a model
    is loaded is a no-op.

    Returns:
        A human-readable status string for the UI: a success message, an
        "already loaded" message, or an error description.  Errors are
        reported through the return value rather than raised.
    """
    global tts_model

    if tts_model is not None:
        return "✅ Model already loaded!"

    print("Loading Vakya model...")
    print(f"Device: {DEVICE}")
    try:
        print("Downloading model files from Hugging Face...")
        print(f"Repository: {MODEL_REPO_ID}")

        # Look the token up once and reuse it for every Hub request below.
        token = _get_hf_token()
        if token:
            try:
                from huggingface_hub import login

                login(token=token, add_to_git_credential=False)
            except Exception as exc:
                # Not fatal: the token is also passed explicitly to each download.
                print(f"⚠️ Hugging Face login failed, continuing: {exc}")

        model_dir = snapshot_download(
            repo_id=MODEL_REPO_ID,
            cache_dir=None,
            local_files_only=False,
            token=token,
        )

        # Find checkpoint and vocab files; sort so the pick does not depend on
        # directory iteration order.
        model_dir_path = Path(model_dir)
        ckpt_files = sorted(model_dir_path.rglob("*.safetensors")) + sorted(
            model_dir_path.rglob("*.pt")
        )
        vocab_files = sorted(model_dir_path.rglob("vocab.txt"))
        ckpt_file = str(ckpt_files[0]) if ckpt_files else ""
        vocab_file = str(vocab_files[0]) if vocab_files else ""
        print(f"Checkpoint: {ckpt_file}")
        print(f"Vocab: {vocab_file}")

        # If the snapshot did not contain them, try well-known file names.
        if not ckpt_file:
            print("Trying to download checkpoint from HF...")
            ckpt_file = _download_first_available(
                ("model.safetensors", "pytorch_model.bin"), token
            )
        if not vocab_file:
            print("Trying to download vocab from HF...")
            vocab_file = _download_first_available(("vocab.txt",), token)

        # Empty strings are passed through when nothing was found; presumably
        # F5TTS then falls back to its own defaults - confirm against f5_tts.api.
        tts_model = F5TTS(
            model_type="F5-TTS",
            ckpt_file=ckpt_file,
            vocab_file=vocab_file,
            device=DEVICE,
            vocoder_name="vocos",
        )
        print("✅ Model loaded successfully!")
        return "✅ Model loaded successfully!"
    except Exception as e:
        error_msg = str(e)
        error_details = f"❌ Error loading model: {error_msg}"
        print(error_details)

        # Authentication problems get a dedicated, actionable message.
        auth_markers = ("401", "Repository Not Found", "Invalid username or password")
        if any(marker in error_msg for marker in auth_markers):
            detailed_error = _format_auth_error()
            print(detailed_error)
            return detailed_error

        traceback.print_exc()
        return error_details
def _normalize_audio(wav):
    """Return *wav* with singleton dimensions removed and peaks within [-1, 1].

    int16 input is converted to float32 by dividing by 32768.  Any other
    dtype is scaled down by its absolute peak, but only when that peak
    exceeds 1.0 (positive or negative).  Empty input is returned unscaled.
    """
    wav = np.asarray(wav)
    if wav.ndim > 1:
        wav = wav.squeeze()
    wav = np.atleast_1d(wav)  # keep len() usable if everything was squeezed away

    if wav.dtype == np.int16:
        return wav.astype(np.float32) / 32768.0

    peak = np.abs(wav).max() if wav.size else 0.0
    if peak > 1.0:
        wav = wav / peak
    return wav


def generate_speech(ref_audio, ref_text, gen_text, speed, remove_silence):
    """Generate speech for *gen_text* in the voice of the reference audio.

    Args:
        ref_audio: Either a ``(sample_rate, samples)`` tuple or a path string
            to an audio file.
        ref_text: Text of the reference audio; an empty string is passed to
            ``preprocess_ref_audio_text`` when this is falsy.
        gen_text: Text to synthesise; must contain non-whitespace characters.
        speed: Forwarded to ``tts_model.infer`` as ``speed``.
        remove_silence: Forwarded to ``tts_model.infer`` as ``remove_silence``.

    Returns:
        ``((sample_rate, samples), status)`` on success, or ``(None, message)``
        when validation fails or an error occurs.  Errors are reported through
        the status message rather than raised.
    """
    if tts_model is None:
        return None, "⚠️ Please load the model first by clicking 'Load Model' button."
    if ref_audio is None:
        return None, "⚠️ Please upload a reference audio file."
    if not gen_text or not gen_text.strip():
        return None, "⚠️ Please enter text to generate."
    if not isinstance(ref_audio, (tuple, str)):
        return None, "⚠️ Invalid audio format."

    tmp_path = None  # set only when this call creates a temporary file
    try:
        if isinstance(ref_audio, tuple):
            # Gradio audio format: (sample_rate, audio_data)
            ref_sr, audio_data = ref_audio
            # Close the descriptor before writing so soundfile can reopen the
            # path on every platform.
            fd, tmp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            sf.write(tmp_path, audio_data, ref_sr)
            ref_audio_path = tmp_path
        else:
            ref_audio_path = ref_audio

        # Preprocess reference audio and text
        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
            ref_audio_path,
            ref_text if ref_text else "",
            device=DEVICE,
        )

        # Generate speech
        print(f"Generating speech for: {gen_text[:50]}...")
        wav, sr, spect = tts_model.infer(
            ref_file=ref_audio_processed,
            ref_text=ref_text_processed,
            gen_text=gen_text,
            speed=speed,
            remove_silence=remove_silence,
            show_info=print,
            progress=None,
        )

        # Convert to numpy array if needed
        if isinstance(wav, torch.Tensor):
            wav = wav.detach().cpu().numpy()
        wav = _normalize_audio(wav)

        # Return audio in Gradio format: (sample_rate, audio_data)
        return (sr, wav), f"✅ Generated {len(wav)/sr:.2f} seconds of audio"
    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return None, error_msg
    finally:
        # Remove the temporary reference file; inference has finished (or
        # failed) by now.  NOTE(review): assumes preprocess_ref_audio_text does
        # not need this path after returning - confirm against f5_tts.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless
# Gradio user interface.
#
# NOTE(review): the source this was rebuilt from had lost its indentation, so
# the row/column nesting below is inferred from statement order - confirm it
# against the deployed layout.

# Introductory help text shown at the top of the page.
_INTRO_MARKDOWN = """
# 🎙️ Vakya 2.0 - Text-to-Speech Playground
**Vakya** is a high-quality Text-to-Speech model supporting 11 Indian languages:
Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu
### How to use:
1. Click **"Load Model"** to load the Vakya model (first time may take a few minutes)
2. Upload a **reference audio** file (WAV format recommended, <15 seconds for best results)
3. Enter the **reference text** (what is spoken in the reference audio) - optional, will auto-transcribe if left blank
4. Enter the **text to generate** (in any of the 11 supported languages)
5. Adjust settings if needed
6. Click **"Generate Speech"** to synthesize audio
### Tips:
- Keep reference audio clips short (<15 seconds) for best results
- Reference text helps the model understand the voice characteristics better
- The model will automatically transcribe reference audio if text is not provided
"""

# Model card summary and usage terms shown at the bottom of the page.
_FOOTER_MARKDOWN = """
---
### 📚 Model Information
- **Model**: Vakya 2.0
- **Repository**: [ashishkblink/vakya2.0](https://huggingface.co/ashishkblink/vakya2.0)
- **Based on**: [IndicF5](https://github.com/AI4Bharat/IndicF5) by AI4Bharat (IIT Madras)
- **License**: MIT License
- **Sample Rate**: 24000 Hz
### ⚠️ Terms of Use
- You must have explicit permission to clone voices
- Unauthorized voice cloning is strictly prohibited
- Any misuse of this model is the responsibility of the user
"""

with gr.Blocks(title="Vakya 2.0 - Text-to-Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown(_INTRO_MARKDOWN)

    # Model loading: one button plus a read-only status box.
    with gr.Row():
        with gr.Column():
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            model_status = gr.Textbox(
                label="Model Status",
                value="⏳ Model not loaded",
                interactive=False,
            )
            load_btn.click(fn=load_model, outputs=model_status)

    # Inputs: reference voice on the left, target text and settings on the right.
    with gr.Row():
        with gr.Column():
            ref_audio_input = gr.Audio(
                label="Reference Audio",
                type="numpy",
                sources=["upload", "microphone"],
                format="wav",
            )
            ref_text_input = gr.Textbox(
                label="Reference Text (Optional)",
                placeholder="Enter the text spoken in the reference audio. Leave blank for auto-transcription.",
                lines=3,
                info="This helps the model understand voice characteristics. Auto-transcription available if left blank.",
            )
        with gr.Column():
            gen_text_input = gr.Textbox(
                label="Text to Generate",
                placeholder="Enter the text you want to synthesize in any supported Indian language...",
                lines=5,
                info="Supports: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu",
            )
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                speed_slider = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Adjust the speed of generated speech",
                )
                remove_silence = gr.Checkbox(
                    label="Remove Silences",
                    value=False,
                    info="Remove silences from generated audio (experimental)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

    # Outputs: synthesised audio and a status line.
    with gr.Row():
        audio_output = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
        status_output = gr.Textbox(label="Status", interactive=False)

    # Input order must match generate_speech's positional parameters.
    generation_inputs = [
        ref_audio_input,
        ref_text_input,
        gen_text_input,
        speed_slider,
        remove_silence,
    ]
    generate_btn.click(
        fn=generate_speech,
        inputs=generation_inputs,
        outputs=[audio_output, status_output],
    )

    gr.Markdown(_FOOTER_MARKDOWN)

if __name__ == "__main__":
    # Enable request queuing, then start the server without a public share link.
    queued_app = app.queue()
    queued_app.launch(share=False)