import gradio as gr
import numpy as np
import os


def synthesize_speech(text, speaker_id=0):
    """
    Placeholder function for speech synthesis.
    Replace this with actual model inference once you have trained models.
    """
    if not text.strip():
        return None

    # Placeholder only: replace with actual model inference
    sample_rate = 24000
    duration = max(1.0, len(text) * 0.08)  # rough estimate: ~80 ms per character
    samples = int(sample_rate * duration)

    # Generate a simple sine wave as placeholder audio
    t = np.linspace(0, duration, samples)
    frequency = 440 + (speaker_id * 50)  # vary base frequency by speaker

    # Build a slightly richer waveform: decaying fundamental, a quieter
    # octave overtone, and a touch of noise
    audio = (
        0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t / (duration * 0.8))
        + 0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t / duration)
        + 0.05 * np.random.randn(samples)  # add some noise
    )

    # Apply a short fade in/out to avoid clicks
    fade_samples = int(0.1 * sample_rate)
    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    return (sample_rate, audio.astype(np.float32))
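
# A minimal sketch of what real inference could look like once trained
# checkpoints exist. Everything below is a hypothetical placeholder (repo id,
# filename, model class, and method names), not a published API; adapt it to
# whatever you actually upload to the Hub.
#
# from huggingface_hub import hf_hub_download
# import torch
#
# def synthesize_speech_from_checkpoint(text, speaker_id=0):
#     ckpt = hf_hub_download(
#         repo_id="your_username/learnable-speech",  # hypothetical repo id
#         filename="model.pt",                       # hypothetical filename
#     )
#     model = LearnableSpeechModel.load(ckpt)  # hypothetical model class
#     with torch.no_grad():
#         audio = model.tts(text, speaker_id=speaker_id)  # hypothetical method
#     return (24000, audio.cpu().numpy().astype(np.float32))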


def create_demo():
    with gr.Blocks(
        title="Learnable-Speech Demo",
        theme=gr.themes.Default(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
    ) as demo:
        gr.Markdown(
            """
            # 🎤 Learnable-Speech: High-Quality 24kHz Speech Synthesis

            An unofficial implementation based on improvements to CosyVoice, featuring a learnable encoder and DAC-VAE.

            > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**

            ## 🚀 How to Train Your Own Model:

            1. **Follow the [Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)**
            2. **Use the provided training scripts** in the `scripts/` directory
            3. **Upload your trained models** to Hugging Face Hub
            4. **Replace the placeholder code** in this Space with your models

            ### Quick Start:

            ```bash
            # 1. Prepare your dataset
            ./scripts/prepare_data.sh

            # 2. Train the model
            ./scripts/train_full_pipeline.sh

            # 3. Upload to Hugging Face
            python scripts/upload_to_hf.py --username your_username
            ```
            """
        )

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                    value="Hello, this is a demo of Learnable-Speech synthesis.",
                )
                with gr.Row():
                    speaker_slider = gr.Slider(
                        minimum=0,
                        maximum=10,
                        value=0,
                        step=1,
                        label="Speaker ID",
                    )
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            with gr.Column():
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                )

        with gr.Accordion("🎯 Training Status & Next Steps", open=True):
            gr.Markdown(
                """
                ### 📋 Current Status:
                - ✅ **Demo Interface**: Ready
                - ❌ **Trained Models**: Not available (placeholder audio only)
                - ❌ **Model Inference**: Not implemented yet

                ### 🔧 To Enable Real Speech Synthesis:
                1. **Train the models** using the provided pipeline
                2. **Upload trained checkpoints** to Hugging Face Hub
                3. **Update the inference code** in the `synthesize_speech()` function (see the sketch below)
                4. **Test with real model outputs**
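
                For example, steps 2 and 3 might start like this once your checkpoints exist; the repo id below is a placeholder for wherever you upload your models:

                ```python
                # Hypothetical example: fetch your uploaded checkpoints into the Space
                from huggingface_hub import snapshot_download

                model_dir = snapshot_download(repo_id="your_username/learnable-speech")
                # ...then load Stage 1 / Stage 2 weights from model_dir inside synthesize_speech()
                ```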

                ### 📚 Resources:
                - [📖 Complete Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)
                - [🛠️ Training Scripts](https://github.com/primepake/learnable-speech/tree/main/scripts)
                - [📄 Research Paper](https://arxiv.org/pdf/2505.07916)
                - [💻 GitHub Repository](https://github.com/primepake/learnable-speech)
                """
            )

        gr.Markdown(
            """
            ### Key Features
            - **24kHz Audio Support**: High-quality audio generation at a 24kHz sampling rate
            - **Flow Matching AE**: Flow-matching training for autoencoders
            - **Immiscible Assignment**: Supports immiscible noise assignment during training
            - **Contrastive Flow Matching**: Supports contrastive flow-matching training

            ### Architecture
            **Stage 1**: Audio to Discrete Tokens - converts raw audio into discrete representations using FSQ (S3Tokenizer)

            **Stage 2**: Discrete Tokens to Continuous Latent Space - maps discrete tokens into a continuous latent space using a VAE

            ### Training Pipeline
            1. Extract discrete tokens using the trained FSQ S3Tokenizer
            2. Generate continuous latent representations using the trained DAC-VAE
            3. Train Stage 1: BPE tokens → discrete FSQ tokens
            4. Train Stage 2: discrete FSQ tokens → DAC-VAE continuous latent space (sketched below)
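
            A conceptual sketch of this pipeline; the names (`s3_tokenizer`, `dac_vae`, and their methods) are illustrative assumptions, not the repository's actual API:

            ```python
            # Conceptual data flow only; object and method names are illustrative
            fsq_tokens = s3_tokenizer.encode(audio_24k)  # Stage 1 targets: audio -> discrete FSQ tokens
            latents = dac_vae.encode(audio_24k)          # Stage 2 targets: audio -> continuous latents
            # Stage 1 model learns: BPE text tokens -> discrete FSQ tokens
            # Stage 2 model learns: discrete FSQ tokens -> DAC-VAE latents (flow matching)
            waveform = dac_vae.decode(latents)           # DAC-VAE decoder -> 24 kHz audio
            ```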

            ### Links
            - [GitHub Repository](https://github.com/primepake/learnable-speech)
            - [Technical Paper](https://arxiv.org/pdf/2505.07916)
            """
        )

        with gr.Row():
            gr.Examples(
                examples=[
                    ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
                    ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
                    ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
                    ["This implementation uses flow matching for high-quality 24kHz audio generation."],
                ],
                inputs=[text_input],
                fn=lambda x: synthesize_speech(x, 0),
                outputs=audio_output,
                cache_examples=False,
                label="Example Texts",
            )

        generate_btn.click(
            fn=synthesize_speech,
            inputs=[text_input, speaker_slider],
            outputs=audio_output,
        )

    return demo


if __name__ == "__main__":
    # Read environment variables so the app can run locally or in a Space
    port = int(os.environ.get("PORT", 7860))
    host = os.environ.get("HOST", "0.0.0.0")

    demo = create_demo()
    # enable_queue= was removed from launch() in newer Gradio versions;
    # call queue() explicitly instead, which works on both old and new releases
    demo.queue()

    # Try to launch, falling back to a share link if binding the port fails
    try:
        demo.launch(
            server_name=host,
            server_port=port,
            share=False,
            show_error=True,
            quiet=False,
        )
    except Exception as e:
        print(f"Failed to launch on {host}:{port} ({e}); retrying with share=True")
        demo.launch(
            share=True,
            show_error=True,
            quiet=False,
        )