Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import os | |
def synthesize_speech(text, speaker_id=0):
    """Return placeholder audio for *text* as a ``(sample_rate, waveform)`` pair.

    This is a stand-in for real model inference: it produces a decaying
    two-harmonic sine tone mixed with a little Gaussian noise. The output is
    not speech, and because the noise is unseeded it differs between calls.

    Args:
        text: Text to "synthesize". Only its length is used, to choose the
            clip duration. ``None``, empty, or whitespace-only text yields no
            audio.
        speaker_id: Number that raises the base pitch by 50 Hz per step above
            440 Hz.

    Returns:
        ``None`` when there is no text to synthesize; otherwise a tuple of the
        sample rate (24000) and a 1-D ``float32`` NumPy array of samples.
    """
    # Guard against None as well as blank strings so that a missing value
    # does not raise AttributeError on .strip().
    if text is None or not text.strip():
        return None

    sample_rate = 24000
    # Rough estimate: 80 ms per character, never shorter than one second.
    duration = max(1.0, len(text) * 0.08)
    samples = int(sample_rate * duration)

    # Generate sine-based waveform: fundamental + second harmonic, each with
    # its own exponential decay, plus low-level white noise.
    t = np.linspace(0, duration, samples, endpoint=False)
    frequency = 440 + (speaker_id * 50)
    audio = (
        0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t / (duration * 0.8))
        + 0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t / duration)
        + 0.05 * np.random.randn(samples)
    )

    # Fade in/out safely: cap the 100 ms ramps at half the clip so the two
    # ramps can never overlap, and skip them entirely for a zero-length ramp
    # (audio[-0:] would otherwise select the whole array).
    fade_samples = min(int(0.1 * sample_rate), samples // 2)
    if fade_samples > 0:
        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    return (sample_rate, audio.astype(np.float32))
def create_demo():
    """Assemble the Gradio Blocks interface for the placeholder speech demo.

    The page consists of an introductory banner, a two-column row (text box,
    speaker slider and generate button on one side; audio player on the
    other), and a set of example sentences that fill the text box.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    # Page-level constants, spelled out up front so the layout code below
    # stays short.
    page_css = (
        "\n"
        ".gradio-container {\n"
        "max-width: 1200px !important;\n"
        "}\n"
    )
    intro_markdown = (
        "\n"
        "# 🎤 Learnable-Speech: High-Quality 24kHz Speech Synthesis\n"
        "An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.\n"
        "> **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**\n"
    )
    example_sentences = (
        "Hello everyone! I am here to tell you that Learnable-Speech is amazing!",
        "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle.",
        "We propose Learnable-Speech, a new approach to neural text-to-speech synthesis.",
        "This implementation uses flow matching for high-quality 24kHz audio generation.",
    )
    # gr.Examples expects one row per example, one cell per input component.
    example_rows = [[sentence] for sentence in example_sentences]

    app_theme = gr.themes.Default()
    with gr.Blocks(title="Learnable-Speech Demo", theme=app_theme, css=page_css) as demo:
        gr.Markdown(intro_markdown)

        with gr.Row():
            # Input side: what to say, which voice, and the trigger button.
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                    value="Hello, this is a demo of Learnable-Speech synthesis.",
                )
                speaker_slider = gr.Slider(
                    minimum=0,
                    maximum=10,
                    value=0,
                    step=1,
                    label="Speaker ID",
                )
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary")

            # Output side: player for the (sample_rate, ndarray) result.
            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="numpy")

        # Wire the button to the synthesis function.
        generate_btn.click(
            fn=synthesize_speech,
            inputs=[text_input, speaker_slider],
            outputs=audio_output,
        )

        gr.Examples(examples=example_rows, inputs=[text_input])

    return demo
if __name__ == "__main__":
    # Bind address and port can be overridden through the environment;
    # the defaults listen on all interfaces at port 7860.
    port = int(os.environ.get("PORT", 7860))
    host = os.environ.get("HOST", "0.0.0.0")

    demo = create_demo()

    # Queuing is configured through Blocks.queue(). The `enable_queue`
    # keyword of launch() was deprecated and later removed, so passing it
    # makes launch() raise TypeError before the server ever starts.
    demo.queue()

    try:
        demo.launch(
            server_name=host,
            server_port=port,
            share=False,
            show_error=True,
            quiet=False,
        )
    except Exception as exc:
        # Best-effort fallback: let Gradio pick its own host/port and request
        # a share link. Report the original failure so it is not lost.
        print(f"Failed to launch on {host}:{port}, trying with share=True")
        print(f"Launch error: {exc!r}")
        demo.launch(
            share=True,
            show_error=True,
            quiet=False,
        )