indicF5

Sleeping

App Files Files Community

ashishkblink committed on Jan 5

Commit

dcc8cc6

verified ·

1 Parent(s): 285f6b8

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +259 -255

app.py CHANGED Viewed

@@ -1,145 +1,198 @@
 """
-Vakya TTS - Hugging Face Space Playground
-India's No. 1 TTS Model for Hindi and Other Indian Languages
 """
-import gradio as gr
-from TTS.api import TTS
 import os
 import tempfile
 from pathlib import Path
-# Initialize the TTS model
-MODEL_NAME = "ashishkblink/vakya"
-print("🚀 Loading Vakya TTS model...")
-print(f"📦 Model: {MODEL_NAME}")
-tts = None
-# TTS 0.22.0 expects model names in format: "model_type/language/dataset/model" (4 parts)
-# Custom HuggingFace models use format: "username/modelname" (2 parts)
-# This is a known limitation - we'll try multiple approaches
-try:
-    # Method 1: Try direct loading
-    print("📦 Attempting Method 1: Direct model loading...")
-    tts = TTS(model_name=MODEL_NAME)
-    print("✅ Model loaded successfully with Method 1!")
-except Exception as e1:
-    error_msg1 = str(e1)
-    print(f"⚠️ Method 1 failed: {error_msg1}")
-    # Check if it's the unpacking error
-    if "not enough values to unpack" in error_msg1 or "expected 4" in error_msg1:
-        print("\n" + "="*70)
-        print("⚠️ DETECTED: Model name format issue")
-        print("="*70)
-        print(f"The TTS library expects model names in format:")
-        print(f"  'model_type/language/dataset/model' (4 parts)")
-        print(f"But your model is: '{MODEL_NAME}' (2 parts)")
-        print("\nThis suggests TTS 0.22.0 may not support custom HuggingFace")
-        print("model repositories in this format.")
-        print("="*70 + "\n")
-    # Method 2: Try with explicit model type (won't work but shows we tried)
-    try:
-        print("📦 Attempting Method 2: With explicit model type...")
-        tts = TTS(model_name=MODEL_NAME, model_type="tts_models/multilingual/multi-dataset/xtts_v2")
-        print("✅ Model loaded successfully with Method 2!")
-    except Exception as e2:
-        print(f"⚠️ Method 2 failed: {e2}")
-        # Method 3: Try base XTTS-v2 (to verify TTS works)
         try:
-            print("\n📦 Attempting Method 3: Base XTTS-v2 model (for testing)...")
-            tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-            print("✅ Base XTTS-v2 model loaded successfully!")
-            print("⚠️ NOTE: Using base XTTS-v2 instead of custom Vakya model")
-        except Exception as e3:
-            print(f"❌ Method 3 also failed: {e3}")
             import traceback
             traceback.print_exc()
-            tts = None
-            print("\n" + "="*70)
-            print("❌ ERROR: Could not load any TTS model")
-            print("="*70)
-            print("\nPossible solutions:")
-            print("1. Check if the model repository structure on HuggingFace")
-            print("   matches what TTS library expects")
-            print("2. The model may need to be in a different format")
-            print("3. TTS 0.22.0 may not support custom HuggingFace models")
-            print("   in 'username/modelname' format")
-            print("\nThe app will continue but TTS functionality will be disabled.")
-            print("="*70 + "\n")
-# Supported languages for Indian languages
-INDIAN_LANGUAGES = {
-    "Hindi": "hi",
-    "English": "en",
-    "Marathi": "mr",
-    "Telugu": "te",
-    "Tamil": "ta",
-    "Kannada": "kn",
-    "Gujarati": "gu",
-    "Punjabi": "pa",
-    "Bengali": "bn",
-    "Urdu": "ur",
-}
-# Example texts for each language
-EXAMPLE_TEXTS = {
-    "hi": "नमस्ते, यह वाक्य TTS मॉडल है। यह भारत का नंबर एक टेक्स्ट-टू-स्पीच मॉडल है।",
-    "en": "Hello, this is the Vakya TTS model. It is India's number one text-to-speech model.",
-    "mr": "नमस्कार, हे वाक्य TTS मॉडेल आहे. हे भारतातील नंबर वन टेक्स्ट-टू-स्पीच मॉडेल आहे.",
-    "te": "నమస్కారం, ఇది వాక్య TTS మోడల్. ఇది భారతదేశంలోని నంబర్ వన్ టెక్స్ట్-టు-స్పీచ్ మోడల్.",
-    "ta": "வணக்கம், இது வாக்கிய TTS மாதிரி. இது இந்தியாவின் நம்பர் ஒன் டெக்ஸ்ட்-டு-ஸ்பீச் மாதிரி.",
-    "kn": "ನಮಸ್ಕಾರ, ಇದು ವಾಕ್ಯ TTS ಮಾದರಿ. ಇದು ಭಾರತದ ನಂಬರ್ ವನ್ ಟೆಕ್ಸ್ಟ್-ಟು-ಸ್ಪೀಚ್ ಮಾದರಿ.",
-    "gu": "નમસ્તે, આ વાક્ય TTS મોડલ છે. આ ભારતનું નંબર વન ટેક્સ્ટ-ટુ-સ્પીચ મોડલ છે.",
-    "pa": "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਇਹ ਵਾਕ TTS ਮਾਡਲ ਹੈ। ਇਹ ਭਾਰਤ ਦਾ ਨੰਬਰ ਵਨ ਟੈਕਸਟ-ਟੂ-ਸਪੀਚ ਮਾਡਲ ਹੈ।",
-    "bn": "নমস্কার, এটি বাক্য TTS মডেল। এটি ভারতের নম্বর ওয়ান টেক্সট-টু-স্পিচ মডেল।",
-    "ur": "السلام علیکم، یہ واکیہ TTS ماڈل ہے۔ یہ بھارت کا نمبر ایک ٹیکسٹ-ٹو-اسپیچ ماڈل ہے۔",
-}
-def synthesize_speech(text, language, speaker_audio):
-    """
-    Synthesize speech from text using Vakya TTS model
-    """
-    if tts is None:
-        return None, "❌ Model not loaded. Please check the logs."
-    if not text or not text.strip():
-        return None, "⚠️ Please enter some text to synthesize."
-    # Get language code
-    lang_code = INDIAN_LANGUAGES.get(language, "hi")
-    # Create temporary file for output
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-        output_path = tmp_file.name
     try:
-        # XTTS requires a speaker_wav for voice cloning
-        # If speaker audio is provided, use it
-        if speaker_audio is not None:
-            speaker_wav = speaker_audio
-        else:
-            # Try to use a default sample from the model
-            # XTTS can work without explicit speaker_wav if using TTS.api
-            # Let's use a simple approach - try with a minimal default
-            speaker_wav = None
-        # Synthesize speech using TTS API
-        # The TTS.api handles the speaker_wav internally if not provided
-        tts.tts_to_file(
-            text=text,
-            speaker_wav=speaker_wav if speaker_wav else None,
-            language=lang_code,
-            file_path=output_path
         )
-        return output_path, "✅ Speech generated successfully! 🎉"
     except Exception as e:
         error_msg = f"❌ Error generating speech: {str(e)}"
         print(error_msg)
@@ -147,165 +200,116 @@ def synthesize_speech(text, language, speaker_audio):
         traceback.print_exc()
         return None, error_msg
-# Custom CSS for better styling
-css = """
-.gradio-container {
-    font-family: 'Inter', sans-serif;
-}
-.header {
-    text-align: center;
-    padding: 20px;
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    color: white;
-    border-radius: 10px;
-    margin-bottom: 20px;
-}
-.header h1 {
-    margin: 0;
-    font-size: 2.5em;
-}
-.header p {
-    margin: 10px 0 0 0;
-    font-size: 1.2em;
-    opacity: 0.9;
-}
-"""
 # Create Gradio interface
-with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div class="header">
-        <h1>🎤 Vakya TTS</h1>
-        <p>India's No. 1 TTS Model for Hindi and Other Indian Languages</p>
-    </div>
-    """)
     gr.Markdown("""
-    ### Welcome to Vakya TTS Playground! 🚀
-    **Test the power of India's premier Text-to-Speech model:**
-    - 🎯 **High-quality Hindi TTS** - Optimized for Hindi pronunciation
-    - 🌍 **Multi-Indian Language Support** - Supports 10+ Indian languages
-    - 🎭 **Voice Cloning** - Clone voices from just 6 seconds of audio
-    - ⚡ **Real-time Synthesis** - Fast and efficient speech generation
-    **How to use:**
-    1. Enter your text in the text box
-    2. Select the language (Hindi, English, Marathi, Telugu, Tamil, etc.)
-    3. (Optional) Upload a speaker reference audio file for voice cloning
-    4. Click "Generate Speech" and enjoy! 🎉
     """)
     with gr.Row():
-        with gr.Column(scale=1):
-            text_input = gr.Textbox(
-                label="📝 Enter Text",
-                placeholder="Type your text here... (e.g., नमस्ते, यह वाक्य TTS मॉडल है)",
-                lines=5,
-                value=EXAMPLE_TEXTS["hi"]
-            )
-            language_dropdown = gr.Dropdown(
-                label="🌍 Select Language",
-                choices=list(INDIAN_LANGUAGES.keys()),
-                value="Hindi"
-            )
-            speaker_audio = gr.Audio(
-                label="🎤 Speaker Reference Audio (Optional)",
-                type="filepath"
             )
-            gr.Markdown("*Upload a 6+ second audio file to clone the voice. Leave empty for default voice.*")
-            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-            status_text = gr.Textbox(
-                label="Status",
-                interactive=False,
-                value="Ready to generate speech!"
             )
-        with gr.Column(scale=1):
-            output_audio = gr.Audio(
-                label="🔊 Generated Speech",
-                type="filepath"
             )
-            gr.Markdown("""
-            ### 💡 Tips:
-            - For best results in Hindi, use Devanagari script (नमस्ते)
-            - Speaker audio should be clear and at least 6 seconds long
-            - You can download the generated audio by clicking the download button
-            """)
-    # Examples section
-    gr.Markdown("### 📚 Example Texts (Click to use)")
-    def make_example_loader(example_text, lang_name):
-        """Create a function to load example text and language"""
-        def load_example():
-            return example_text, lang_name
-        return load_example
     with gr.Row():
-        for lang_name, lang_code in list(INDIAN_LANGUAGES.items())[:5]:
-            example_text = EXAMPLE_TEXTS.get(lang_code, "")
-            example_btn = gr.Button(
-                f"{lang_name} Example",
-                size="sm"
-            )
-            example_btn.click(
-                fn=make_example_loader(example_text, lang_name),
-                outputs=[text_input, language_dropdown],
-                api_name=f"load_example_{lang_name.lower().replace(' ', '_')}"
-            )
-    with gr.Row():
-        for lang_name, lang_code in list(INDIAN_LANGUAGES.items())[5:]:
-            example_text = EXAMPLE_TEXTS.get(lang_code, "")
-            example_btn = gr.Button(
-                f"{lang_name} Example",
-                size="sm"
-            )
-            example_btn.click(
-                fn=make_example_loader(example_text, lang_name),
-                outputs=[text_input, language_dropdown],
-                api_name=f"load_example_{lang_name.lower().replace(' ', '_')}"
-            )
-    # Footer
     gr.Markdown("""
     ---
-    ### 🔗 Links
-    - **Model Repository**: [ashishkblink/vakya](https://huggingface.co/ashishkblink/vakya)
-    - **Built with**: [Coqui TTS](https://github.com/coqui-ai/TTS)
-    ### 📄 License
-    Apache 2.0
-    *Built with ❤️ for the Indian language community*
     """)
-    # Connect the generate button
-    generate_btn.click(
-        fn=synthesize_speech,
-        inputs=[text_input, language_dropdown, speaker_audio],
-        outputs=[output_audio, status_text]
-    )
-    # Auto-load example when language changes
-    language_dropdown.change(
-        fn=lambda lang: EXAMPLE_TEXTS.get(INDIAN_LANGUAGES.get(lang, "hi"), ""),
-        inputs=[language_dropdown],
-        outputs=[text_input]
-    )
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )

 """
+Vakya 2.0 - Text-to-Speech Playground
+A Hugging Face Space for testing the Vakya TTS model
 """
 import os
+import sys
 import tempfile
+import gradio as gr
+import numpy as np
+import soundfile as sf
+import torch
+from huggingface_hub import hf_hub_download, snapshot_download
 from pathlib import Path
+# Try to import f5_tts - handle different possible locations
+# First attempt: f5_tts is importable as an installed package.
+try:
+    from f5_tts.api import F5TTS
+    from f5_tts.infer.utils_infer import preprocess_ref_audio_text
+except ImportError:
+    # Try adding local paths
+    # Candidate directories, relative to this file, that may contain the f5_tts package.
+    current_dir = os.path.dirname(__file__)
+    possible_paths = [
+        os.path.join(current_dir, "vakya_model"),
+        os.path.join(current_dir, "f5_tts"),
+        os.path.join(current_dir, "..", "vakya_model"),
+    ]
+    for path in possible_paths:
+        if os.path.exists(path):
+            # Put the candidate first on sys.path and retry the import.
+            # NOTE: a candidate whose import fails is left on sys.path.
+            sys.path.insert(0, path)
+            try:
+                from f5_tts.api import F5TTS
+                from f5_tts.infer.utils_infer import preprocess_ref_audio_text
+                break
+            except ImportError:
+                continue
+    else:
+        # for/else: reached only when the loop finished without `break`,
+        # i.e. no candidate directory produced a working import.
+        raise ImportError(
+            "Could not import f5_tts. Please ensure the model code is available. "
+            "You may need to include the f5_tts directory in your Space or install it as a package."
+        )
+# Model configuration
+# Hugging Face repository the checkpoint and vocab files are downloaded from.
+MODEL_REPO_ID = "ashishkblink/vakya2.0"
+# Run on the GPU when torch reports one, otherwise fall back to the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Global model instance
+# Populated by load_model(); stays None until then.
+tts_model = None
+# NOTE(review): `vocoder` is not assigned or read anywhere else in the visible code.
+vocoder = None
+def _try_hub_download(filename):
+    """Fetch one file from MODEL_REPO_ID; return its local path, or "" on failure."""
+    try:
+        return hf_hub_download(
+            repo_id=MODEL_REPO_ID,
+            filename=filename,
+            cache_dir=None
+        )
+    except Exception as e:
+        # Best-effort lookup: a missing file is an expected outcome here, but
+        # report it instead of hiding it. Unlike a bare `except:`, this lets
+        # KeyboardInterrupt and SystemExit propagate.
+        print(f"Could not download {filename}: {e}")
+        return ""
+def load_model():
+    """Load the Vakya model from Hugging Face into the global `tts_model`.
+    Downloads a snapshot of MODEL_REPO_ID, picks the first checkpoint
+    (*.safetensors before *.pt) and the first vocab.txt found in it, falls
+    back to direct single-file downloads when either is missing, and then
+    constructs the F5TTS instance on DEVICE.
+    Returns:
+        A status string for the UI: a success message, an "already loaded"
+        message, or an error message. Exceptions are caught and reported in
+        the returned string rather than raised.
+    """
+    global tts_model
+    if tts_model is not None:
+        return "✅ Model already loaded!"
+    print("Loading Vakya model...")
+    print(f"Device: {DEVICE}")
+    try:
+        # Download model files from Hugging Face
+        print("Downloading model files from Hugging Face...")
+        model_dir = snapshot_download(
+            repo_id=MODEL_REPO_ID,
+            cache_dir=None,
+            local_files_only=False
+        )
+        # Find checkpoint and vocab files
+        model_dir_path = Path(model_dir)
+        ckpt_files = list(model_dir_path.rglob("*.safetensors")) + list(model_dir_path.rglob("*.pt"))
+        vocab_files = list(model_dir_path.rglob("vocab.txt"))
+        ckpt_file = str(ckpt_files[0]) if ckpt_files else ""
+        vocab_file = str(vocab_files[0]) if vocab_files else ""
+        print(f"Checkpoint: {ckpt_file}")
+        print(f"Vocab: {vocab_file}")
+        # If files not found in repo, try using HF paths directly
+        if not ckpt_file:
+            print("Trying to download checkpoint from HF...")
+            # Same order as before: safetensors first, then the .bin fallback.
+            ckpt_file = _try_hub_download("model.safetensors") or _try_hub_download("pytorch_model.bin")
+        if not vocab_file:
+            print("Trying to download vocab from HF...")
+            vocab_file = _try_hub_download("vocab.txt")
+        # Initialize F5TTS model
+        # Both paths may still be "" here; per the original author's note,
+        # F5TTS is expected to fall back to its default in that case.
+        tts_model = F5TTS(
+            model_type="F5-TTS",
+            ckpt_file=ckpt_file,
+            vocab_file=vocab_file,
+            device=DEVICE,
+            vocoder_name="vocos"
+        )
+        print("✅ Model loaded successfully!")
+        return "✅ Model loaded successfully!"
+    except Exception as e:
+        # UI boundary: report the failure in the status box instead of raising.
+        error_msg = f"❌ Error loading model: {str(e)}"
+        print(error_msg)
+        import traceback
+        traceback.print_exc()
+        return error_msg
+def generate_speech(ref_audio, ref_text, gen_text, speed, remove_silence):
+    """Generate speech from text using reference audio.
+    Args:
+        ref_audio: Reference voice, either a (sample_rate, audio_data) tuple
+            or a file path string.
+        ref_text: Transcript of the reference audio; "" is passed on when empty.
+        gen_text: Text to synthesize.
+        speed: Forwarded to tts_model.infer as `speed`.
+        remove_silence: Forwarded to tts_model.infer as `remove_silence`.
+    Returns:
+        ((sample_rate, wav), status_message) on success, or
+        (None, message) when validation fails or an exception is caught.
+    """
+    global tts_model
+    if tts_model is None:
+        return None, "⚠️ Please load the model first by clicking 'Load Model' button."
+    if ref_audio is None:
+        return None, "⚠️ Please upload a reference audio file."
+    if not gen_text or not gen_text.strip():
+        return None, "⚠️ Please enter text to generate."
+    # Path of the temp WAV created below (tuple input only), so it can be
+    # removed afterwards instead of accumulating one file per request.
+    tmp_ref_path = None
     try:
+        # Handle different audio input formats
+        if isinstance(ref_audio, tuple):
+            # Gradio audio format: (sample_rate, audio_data)
+            sr, audio_data = ref_audio
+            # Reserve a unique path, then close the handle before soundfile
+            # writes to it; writing to a still-open temp file fails on some platforms.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_ref:
+                tmp_ref_path = tmp_ref.name
+            sf.write(tmp_ref_path, audio_data, sr)
+            ref_audio_path = tmp_ref_path
+        elif isinstance(ref_audio, str):
+            # File path
+            ref_audio_path = ref_audio
+        else:
+            return None, "⚠️ Invalid audio format."
+        # Preprocess reference audio and text
+        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
+            ref_audio_path,
+            ref_text if ref_text else "",
+            device=DEVICE
         )
+        # Generate speech
+        print(f"Generating speech for: {gen_text[:50]}...")
+        wav, sr, spect = tts_model.infer(
+            ref_file=ref_audio_processed,
+            ref_text=ref_text_processed,
+            gen_text=gen_text,
+            speed=speed,
+            remove_silence=remove_silence,
+            show_info=print,
+            progress=None
+        )
+        # Convert to numpy array if needed
+        if isinstance(wav, torch.Tensor):
+            wav = wav.cpu().numpy()
+        # Ensure it's 1D
+        if len(wav.shape) > 1:
+            wav = wav.squeeze()
+        # Normalize audio
+        if wav.dtype == np.int16:
+            wav = wav.astype(np.float32) / 32768.0
+        elif wav.size:
+            # Compare the absolute peak: a clip whose largest excursion is
+            # negative (e.g. -1.5) must be scaled down too.
+            peak = np.abs(wav).max()
+            if peak > 1.0:
+                wav = wav / peak
+        # Return audio in Gradio format: (sample_rate, audio_data)
+        return (sr, wav), f"✅ Generated {len(wav)/sr:.2f} seconds of audio"
     except Exception as e:
         error_msg = f"❌ Error generating speech: {str(e)}"
         print(error_msg)
+        # Imported here so the handler does not depend on an import made elsewhere.
+        import traceback
         traceback.print_exc()
         return None, error_msg
+    finally:
+        # Best-effort cleanup of the temp reference file; a failure to delete
+        # it must not mask the real result or error.
+        if tmp_ref_path is not None:
+            try:
+                os.remove(tmp_ref_path)
+            except OSError:
+                pass
 # Create Gradio interface
+# Layout, top to bottom: intro text, model-loading row, input row
+# (reference audio/text on the left, target text and settings on the right),
+# generate button, output row, and a footer with model information.
+with gr.Blocks(title="Vakya 2.0 - Text-to-Speech", theme=gr.themes.Soft()) as app:
     gr.Markdown("""
+    # 🎙️ Vakya 2.0 - Text-to-Speech Playground
+    **Vakya** is a high-quality Text-to-Speech model supporting 11 Indian languages:
+    Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu
+    ### How to use:
+    1. Click **"Load Model"** to load the Vakya model (first time may take a few minutes)
+    2. Upload a **reference audio** file (WAV format recommended, <15 seconds for best results)
+    3. Enter the **reference text** (what is spoken in the reference audio) - optional, will auto-transcribe if left blank
+    4. Enter the **text to generate** (in any of the 11 supported languages)
+    5. Adjust settings if needed
+    6. Click **"Generate Speech"** to synthesize audio
+    ### Tips:
+    - Keep reference audio clips short (<15 seconds) for best results
+    - Reference text helps the model understand the voice characteristics better
+    - The model will automatically transcribe reference audio if text is not provided
     """)
+    # Model loading: the button calls load_model and shows its returned status string.
     with gr.Row():
+        with gr.Column():
+            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
+            model_status = gr.Textbox(label="Model Status", value="⏳ Model not loaded", interactive=False)
+        load_btn.click(
+            fn=load_model,
+            outputs=model_status
+        )
+    # Inputs: reference voice on the left, text to synthesize and settings on the right.
+    with gr.Row():
+        with gr.Column():
+            ref_audio_input = gr.Audio(
+                label="Reference Audio",
+                type="numpy",
+                sources=["upload", "microphone"],
+                format="wav"
             )
+            ref_text_input = gr.Textbox(
+                label="Reference Text (Optional)",
+                placeholder="Enter the text spoken in the reference audio. Leave blank for auto-transcription.",
+                lines=3,
+                info="This helps the model understand voice characteristics. Auto-transcription available if left blank."
             )
+        with gr.Column():
+            gen_text_input = gr.Textbox(
+                label="Text to Generate",
+                placeholder="Enter the text you want to synthesize in any supported Indian language...",
+                lines=5,
+                info="Supports: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu"
             )
+            with gr.Accordion("⚙️ Advanced Settings", open=False):
+                speed_slider = gr.Slider(
+                    label="Speed",
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    info="Adjust the speed of generated speech"
+                )
+                remove_silence = gr.Checkbox(
+                    label="Remove Silences",
+                    value=False,
+                    info="Remove silences from generated audio (experimental)"
+                )
+    generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+    # Outputs: the generated audio and a status message.
     with gr.Row():
+        audio_output = gr.Audio(
+            label="Generated Audio",
+            type="numpy",
+            autoplay=True
+        )
+        status_output = gr.Textbox(
+            label="Status",
+            interactive=False
+        )
+    # The inputs list is positional: its order must match the parameter order of
+    # generate_speech(ref_audio, ref_text, gen_text, speed, remove_silence).
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[
+            ref_audio_input,
+            ref_text_input,
+            gen_text_input,
+            speed_slider,
+            remove_silence
+        ],
+        outputs=[audio_output, status_output]
+    )
     gr.Markdown("""
     ---
+    ### 📚 Model Information
+    - **Model**: Vakya 2.0
+    - **Repository**: [ashishkblink/vakya2.0](https://huggingface.co/ashishkblink/vakya2.0)
+    - **Based on**: [IndicF5](https://github.com/AI4Bharat/IndicF5) by AI4Bharat (IIT Madras)
+    - **License**: MIT License
+    - **Sample Rate**: 24000 Hz
+    ### ⚠️ Terms of Use
+    - You must have explicit permission to clone voices
+    - Unauthorized voice cloning is strictly prohibited
+    - Any misuse of this model is the responsibility of the user
     """)
+# Enable Gradio's request queue before launching; no public share link is created.
 if __name__ == "__main__":
+    app.queue().launch(share=False)