import gradio as gr
import numpy as np
import os


def synthesize_speech(text, speaker_id=0):
    """
    Placeholder function for speech synthesis.

    Replace this with actual model inference once you have trained models.
    """
    if not text.strip():
        return None

    # This is a placeholder - replace with actual model inference
    sample_rate = 24000
    duration = max(1.0, len(text) * 0.08)  # rough estimate: ~80 ms per character
    samples = int(sample_rate * duration)

    # Generate a simple sine wave as placeholder audio
    t = np.linspace(0, duration, samples)
    frequency = 440 + (speaker_id * 50)  # vary the base frequency by speaker

    # Create a slightly more interesting waveform: a decaying fundamental,
    # a quieter octave harmonic, and a touch of noise
    audio = (
        0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t / (duration * 0.8))
        + 0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t / duration)
        + 0.05 * np.random.randn(samples)
    )

    # Apply fade in/out to avoid clicks at the clip boundaries
    fade_samples = int(0.1 * sample_rate)
    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    return (sample_rate, audio.astype(np.float32))
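
# --- Sketch: swapping in real inference (hypothetical API) ---
# Once trained checkpoints exist, the interface below tells users to update
# the inference code. A minimal sketch of that swap is kept here as a comment
# so the placeholder demo still runs. The repo id, checkpoint file name, and
# the `load_model` / `model.tts` calls are assumptions, not part of this
# codebase; `hf_hub_download` is the real huggingface_hub helper for fetching
# a single file from the Hub.
#
# from huggingface_hub import hf_hub_download
#
# def synthesize_speech(text, speaker_id=0):
#     ckpt = hf_hub_download(
#         repo_id="your_username/learnable-speech",  # hypothetical repo id
#         filename="stage2.pt",                      # hypothetical file name
#     )
#     model = load_model(ckpt)                       # hypothetical loader
#     wav = model.tts(text, speaker_id=speaker_id)   # hypothetical inference
#     return (24000, np.asarray(wav, dtype=np.float32))
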
def create_demo():
    with gr.Blocks(
        title="Learnable-Speech Demo",
        theme=gr.themes.Default(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
    ) as demo:
        gr.Markdown(
            """
            # 🎤 Learnable-Speech: High-Quality 24kHz Speech Synthesis

            An unofficial implementation based on CosyVoice improvements, with a learnable encoder and DAC-VAE.

            > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**

            ## 🚀 How to Train Your Own Model:
            1. **Follow the [Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)**
            2. **Use the provided training scripts** in the `scripts/` directory
            3. **Upload your trained models** to Hugging Face Hub
            4. **Replace the placeholder code** in this Space with your models

            ### Quick Start:
            ```bash
            # 1. Prepare your dataset
            ./scripts/prepare_data.sh

            # 2. Train the model
            ./scripts/train_full_pipeline.sh

            # 3. Upload to Hugging Face
            python scripts/upload_to_hf.py --username your_username
            ```
            """
        )

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                    value="Hello, this is a demo of Learnable-Speech synthesis.",
                )
                with gr.Row():
                    speaker_slider = gr.Slider(
                        minimum=0,
                        maximum=10,
                        value=0,
                        step=1,
                        label="Speaker ID",
                    )
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            with gr.Column():
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                )

        with gr.Accordion("🎯 Training Status & Next Steps", open=True):
            gr.Markdown(
                """
                ### 📋 Current Status:
                - ✅ **Demo Interface**: Ready
                - ❌ **Trained Models**: Not available (placeholder audio only)
                - ❌ **Model Inference**: Not implemented yet

                ### 🔧 To Enable Real Speech Synthesis:
                1. **Train the models** using the provided pipeline
                2. **Upload trained checkpoints** to Hugging Face Hub
                3. **Update the inference code** in the `synthesize_speech()` function
                4. **Test with real model outputs**

                ### 📚 Resources:
                - [📖 Complete Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)
                - [🛠️ Training Scripts](https://github.com/primepake/learnable-speech/tree/main/scripts)
                - [📄 Research Paper](https://arxiv.org/pdf/2505.07916)
                - [💻 GitHub Repository](https://github.com/primepake/learnable-speech)
                """
            )

        gr.Markdown(
            """
            ### Key Features
            - **24kHz Audio Support**: High-quality audio generation at a 24kHz sampling rate
            - **Flow Matching AE**: Flow matching training for autoencoders
            - **Immiscible Assignment**: Supports immiscible noise assignment during training
            - **Contrastive Flow Matching**: Supports contrastive flow matching training

            ### Architecture
            **Stage 1**: Audio to Discrete Tokens
            - Converts raw audio into discrete representations using FSQ (S3Tokenizer)

            **Stage 2**: Discrete Tokens to Continuous Latent Space
            - Maps discrete tokens to a continuous latent space using a VAE

            ### Training Pipeline
            1. Extract discrete tokens using the trained FSQ S3Tokenizer
            2. Generate continuous latent representations using the trained DAC-VAE
            3. Train Stage 1: BPE tokens → discrete FSQ tokens
            4. Train Stage 2: discrete FSQ tokens → DAC-VAE continuous latent space

            ### Links
            - [GitHub Repository](https://github.com/primepake/learnable-speech)
            - [Technical Paper](https://arxiv.org/pdf/2505.07916)
            """
        )

        with gr.Row():
            gr.Examples(
                examples=[
                    ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
                    ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
                    ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
                    ["This implementation uses flow matching for high-quality 24kHz audio generation."],
                ],
                inputs=[text_input],
                fn=lambda x: synthesize_speech(x, 0),
                outputs=audio_output,
                cache_examples=False,
                label="Example Texts",
            )

        generate_btn.click(
            fn=synthesize_speech,
            inputs=[text_input, speaker_slider],
            outputs=audio_output,
        )

    return demo


if __name__ == "__main__":
    # Read environment variables for flexible deployment
    port = int(os.environ.get("PORT", 7860))
    host = os.environ.get("HOST", "0.0.0.0")

    demo = create_demo()

    # Note: `enable_queue` is no longer a launch() argument in Gradio 4.x;
    # call queue() on the Blocks instance instead.
    demo.queue()

    # Try to launch on the configured host/port; fall back to a share link
    try:
        demo.launch(
            server_name=host,
            server_port=port,
            share=False,
            show_error=True,
            quiet=False,
        )
    except Exception as e:
        print(f"Failed to launch on {host}:{port} ({e}), trying with share=True")
        demo.launch(
            share=True,
            show_error=True,
            quiet=False,
        )
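
# Usage (assuming this file is saved as app.py, e.g. in a Hugging Face Space):
#   python app.py                            # serves on 0.0.0.0:7860 by default
#   HOST=127.0.0.1 PORT=8080 python app.py   # override via the env vars read above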