Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import os | |
| import tempfile | |
| import logging | |
| from pathlib import Path | |
# Configure the root logger at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level model cache shared by load_tts_model() and generate_speech():
# the TTS instance itself and a flag recording whether loading succeeded.
tts_model, model_loaded = None, False
# Checkpoint/config pairs probed on disk, in priority order.
_LOCAL_MODEL_CANDIDATES = (
    ("./best_model.pth", "./config.json"),              # Current directory
    ("./model/best_model.pth", "./model/config.json"),  # Model subdirectory
    ("../best_model.pth", "../config.json"),            # Parent directory
)


def _load_from_hub(model_repo="SYSPIN/vits_Chhattisgarhi_Female",
                   cache_dir="./model_cache"):
    """Download checkpoint + config from the Hugging Face Hub and build a TTS object.

    Raises whatever the imports, the downloads or the TTS constructor raise;
    the caller decides how to fall back.
    """
    from TTS.api import TTS
    from huggingface_hub import hf_hub_download

    logger.info(f"Attempting to load model from {model_repo}...")
    model_path = hf_hub_download(
        repo_id=model_repo,
        filename="best_model.pth",
        cache_dir=cache_dir,
    )
    config_path = hf_hub_download(
        repo_id=model_repo,
        filename="config.json",
        cache_dir=cache_dir,
    )
    return TTS(model_path=model_path, config_path=config_path)


def _load_from_local():
    """Return a TTS object built from the first usable local file pair, else None."""
    for model_path, config_path in _LOCAL_MODEL_CANDIDATES:
        if not (os.path.exists(model_path) and os.path.exists(config_path)):
            continue
        logger.info(f"Found local model files at {model_path}")
        try:
            from TTS.api import TTS
            return TTS(model_path=model_path, config_path=config_path)
        except Exception as e:
            # A broken candidate must not abort the search: keep probing the
            # remaining paths and let the caller reach the generic fallback.
            logger.warning(f"Failed to load local model at {model_path}: {e}")
    return None


def _load_fallback():
    """Build the generic multilingual model used when the custom one is unavailable."""
    from TTS.api import TTS
    return TTS("tts_models/multilingual/multi-dataset/xtts_v2")


def load_tts_model():
    """Load the TTS model, trying several sources in order.

    Strategies, first success wins:
      1. custom Chhattisgarhi checkpoint from the Hugging Face Hub,
      2. checkpoint/config files found on local disk,
      3. a generic multilingual model.

    On success the instance is stored in the module-level ``tts_model`` and
    ``model_loaded`` is set to True, so later calls return immediately.

    Returns:
        bool: True if a model is available, False if every strategy failed.
        The function never raises; failures are logged.
    """
    global tts_model, model_loaded

    if model_loaded:
        return True

    try:
        # Method 1: Hugging Face Hub
        try:
            tts_model = _load_from_hub()
            model_loaded = True
            logger.info("✅ Model loaded successfully from Hugging Face Hub!")
            return True
        except ImportError as e:
            # Either TTS or huggingface_hub may be the missing package, so
            # report the actual import error instead of guessing.
            logger.warning(f"Required package not available ({e}), trying local files...")
        except Exception as e:
            logger.warning(f"Failed to load from HF Hub: {e}")

        # Method 2: local files (uploaded to the space or cloned)
        local_model = _load_from_local()
        if local_model is not None:
            tts_model = local_model
            model_loaded = True
            logger.info("✅ Model loaded successfully from local files!")
            return True

        # Method 3: generic multilingual model as last resort
        logger.warning("Custom model not found, trying generic VITS model...")
        try:
            tts_model = _load_fallback()
            model_loaded = True
            logger.info("✅ Loaded fallback multilingual model")
            return True
        except Exception as e:
            logger.error(f"Failed to load fallback model: {e}")
            return False
    except Exception as e:
        # Safety net so callers only ever see a boolean.
        logger.error(f"Critical error loading model: {str(e)}")
        return False
def generate_speech(text, speed=1.0):
    """Synthesize ``text`` to audio with the loaded TTS model.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only input is
            rejected with a user-facing message.
        speed: Speaking-rate value forwarded to ``tts_to_file``; dropped
            when the model's API rejects the keyword with a TypeError.

    Returns:
        tuple: ``((sample_rate, audio_data), status_message)`` on success,
        ``(None, status_message)`` on any failure. Exceptions are caught,
        logged and turned into the status message.
    """
    global tts_model, model_loaded

    # The UI/API layer can hand over None as well as "", so guard both.
    if text is None or not text.strip():
        return None, "⚠️ Please enter some text to synthesize."

    # Try to load model if not already loaded
    if not model_loaded:
        success = load_tts_model()
        if not success:
            return None, "❌ Error: Could not load any TTS model. Please check the setup."

    output_path = None
    try:
        logger.info(f"Synthesizing: {text[:50]}...")

        # Reserve a file name; delete=False because the TTS engine reopens
        # the path itself, so removal is handled in the finally clause.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Generate speech - handle different TTS API versions
        try:
            # Method for custom models
            tts_model.tts_to_file(
                text=text,
                file_path=output_path,
                speed=speed,
            )
        except TypeError:
            # Fallback for models that don't support speed parameter
            try:
                tts_model.tts_to_file(text=text, file_path=output_path)
            except Exception:
                # For XTTS and other models that need different parameters.
                # NOTE(review): with speaker_wav=None a multi-speaker model may
                # still reject this call - confirm against the Coqui TTS API.
                tts_model.tts_to_file(
                    text=text,
                    file_path=output_path,
                    speaker_wav=None,  # Use default speaker
                    language="hi",     # Hindi as closest language
                )

        # Check if file was created and has content
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "❌ Error: Audio file was not generated properly."

        # Read audio data
        audio_data, sample_rate = sf.read(output_path)

        if len(audio_data) == 0:
            return None, "❌ Error: Generated audio is empty."

        logger.info("✅ Speech generated successfully!")
        return (sample_rate, audio_data), "✅ Speech generated successfully!"
    except Exception as e:
        error_msg = f"❌ Error during synthesis: {str(e)}"
        logger.error(error_msg)
        return None, error_msg
    finally:
        # Always remove the temp file, including on early returns and errors,
        # so failed requests do not accumulate stray .wav files.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.unlink(output_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temp file {output_path}: {cleanup_error}")
# Sample inputs as [text, speed] rows, shared by the quick-example buttons
# and the gr.Examples table.
_SAMPLE_ROWS = (
    ("नमस्कार, का हाल बा?", 1.0),
    ("आज मोसम बहुत बढ़िया हे।", 1.0),
    ("तुमन कइसे हव?", 0.9),
    ("धन्यवाद।", 1.1),
    ("Hello, how are you?", 1.0),  # English fallback for testing
)
examples = [[sample_text, sample_speed] for sample_text, sample_speed in _SAMPLE_ROWS]
def _startup_status():
    """Load the model when the page opens and return the text for the status box."""
    if load_tts_model():
        return "✅ Model ready!"
    # load_tts_model() is synchronous, so a False result means loading has
    # already failed rather than still being in progress. generate_speech()
    # attempts the load again on the next request.
    return "⚠️ Model could not be loaded at startup; loading will be retried when you generate speech."


# Create Gradio interface
with gr.Blocks(
    title="Chhattisgarhi TTS",
    theme=gr.themes.Default(primary_hue="blue"),
) as demo:
    # Page header
    gr.HTML("""
    <div style="text-align: center; margin: 20px 0;">
    <h1>🗣️ Chhattisgarhi Text-to-Speech</h1>
    <p style="color: #666;">Generate natural Chhattisgarhi speech with AI</p>
    <p style="color: #888; font-size: 0.9em;">Powered by SySpin & Coqui TTS</p>
    </div>
    """)

    with gr.Row():
        # Left column: input controls
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="📝 Enter Text",
                placeholder="छत्तीसगढ़ी में अपना टेक्स्ट लिखें... (Enter Chhattisgarhi text here)",
                lines=3,
                max_lines=6,
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="🎚️ Speech Speed",
                info="Adjust speaking rate (may not work with all models)",
            )
            generate_btn = gr.Button(
                "🎵 Generate Speech",
                variant="primary",
                size="lg",
            )

        # Right column: one button per sample that copies its text into the textbox
        with gr.Column(scale=1):
            gr.Markdown("### Quick Examples")
            for text, _ in examples:
                btn = gr.Button(text, size="sm")
                # Bind the current sample as a default argument so each
                # button keeps its own text (avoids late-binding closures).
                btn.click(lambda x=text: x, outputs=text_input)

    with gr.Row():
        audio_output = gr.Audio(
            label="🔊 Generated Speech",
            type="numpy",
        )
        status_output = gr.Textbox(
            label="📊 Status",
            interactive=False,
            max_lines=3,
        )

    gr.Examples(
        examples=examples,
        inputs=[text_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False,
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        ### About This Model
        - **Language**: Chhattisgarhi (छत्तीसगढ़ी)
        - **Voice Type**: Female
        - **Training**: SySpin dataset
        - **Engine**: Coqui TTS
        ### Model Loading Strategy
        1. First tries to load the custom Chhattisgarhi model from Hugging Face
        2. Falls back to local files if available
        3. Uses a multilingual model as last resort
        ### How to Use
        1. Enter your text in Chhattisgarhi (Devanagari script preferred)
        2. Adjust speech speed if needed (may not work with all models)
        3. Click "Generate Speech"
        4. Listen to the generated audio
        ### Tips
        - Use proper punctuation for natural pauses
        - Shorter sentences often work better
        - If the custom model fails, a fallback model will be used
        """)

    # Event binding
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, speed_slider],
        outputs=[audio_output, status_output],
    )

    # Load model on startup and show the real outcome in the status box
    demo.load(
        fn=_startup_status,
        outputs=status_output,
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {"share": True}
    demo.launch(**launch_options)