mnhatdaous committed on
Commit
36fbe52
·
1 Parent(s): aeff66c

Fix Gradio app deployment issues

Browse files
Files changed (1) hide show
  1. app.py +28 -102
app.py CHANGED
@@ -10,26 +10,25 @@ def synthesize_speech(text, speaker_id=0):
10
  if not text.strip():
11
  return None
12
 
13
- # This is a placeholder - replace with actual model inference
14
  sample_rate = 24000
15
  duration = max(1.0, len(text) * 0.08) # rough estimate
16
  samples = int(sample_rate * duration)
17
 
18
- # Generate simple sine wave as placeholder
19
- t = np.linspace(0, duration, samples)
20
- frequency = 440 + (speaker_id * 50) # vary frequency by speaker
21
 
22
- # Create a more interesting waveform
23
  audio = (
24
  0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t/(duration*0.8)) +
25
  0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t/duration) +
26
- 0.05 * np.random.randn(samples) # add some noise
27
  )
28
 
29
- # Apply fade in/out
30
- fade_samples = int(0.1 * sample_rate)
31
- audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
32
- audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
 
33
 
34
  return (sample_rate, audio.astype(np.float32))
35
 
@@ -50,25 +49,6 @@ def create_demo():
50
  An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
51
 
52
  > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**
53
-
54
- ## 🚀 How to Train Your Own Model:
55
-
56
- 1. **Follow the [Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)**
57
- 2. **Use the provided training scripts** in the `scripts/` directory
58
- 3. **Upload your trained models** to Hugging Face Hub
59
- 4. **Replace the placeholder code** in this Space with your models
60
-
61
- ### Quick Start:
62
- ```bash
63
- # 1. Prepare your dataset
64
- ./scripts/prepare_data.sh
65
-
66
- # 2. Train the model
67
- ./scripts/train_full_pipeline.sh
68
-
69
- # 3. Upload to Hugging Face
70
- python scripts/upload_to_hf.py --username your_username
71
- ```
72
  """
73
  )
74
 
@@ -81,16 +61,15 @@ def create_demo():
81
  value="Hello, this is a demo of Learnable-Speech synthesis."
82
  )
83
 
84
- with gr.Row():
85
- speaker_slider = gr.Slider(
86
- minimum=0,
87
- maximum=10,
88
- value=0,
89
- step=1,
90
- label="Speaker ID"
91
- )
92
 
93
- generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
94
 
95
  with gr.Column():
96
  audio_output = gr.Audio(
@@ -98,83 +77,30 @@ def create_demo():
98
  type="numpy"
99
  )
100
 
101
- with gr.Accordion("🎯 Training Status & Next Steps", open=True):
102
- gr.Markdown(
103
- """
104
- ### 📋 Current Status:
105
- - ✅ **Demo Interface**: Ready
106
- - ❌ **Trained Models**: Not available (placeholder audio only)
107
- - ❌ **Model Inference**: Not implemented yet
108
-
109
- ### 🔧 To Enable Real Speech Synthesis:
110
- 1. **Train the models** using the provided pipeline
111
- 2. **Upload trained checkpoints** to Hugging Face Hub
112
- 3. **Update the inference code** in `synthesize_speech()` function
113
- 4. **Test with real model outputs**
114
-
115
- ### 📚 Resources:
116
- - [📖 Complete Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)
117
- - [🛠️ Training Scripts](https://github.com/primepake/learnable-speech/tree/main/scripts)
118
- - [📄 Research Paper](https://arxiv.org/pdf/2505.07916)
119
- - [💻 GitHub Repository](https://github.com/primepake/learnable-speech)
120
- """
121
- )
122
- gr.Markdown(
123
- """
124
- ### Key Features
125
- - **24kHz Audio Support**: High-quality audio generation at 24kHz sampling rate
126
- - **Flow matching AE**: Flow matching training for autoencoders
127
- - **Immiscible assignment**: Support immiscible adding noise while training
128
- - **Contrastive Flow matching**: Support Contrastive training
129
-
130
- ### Architecture
131
- **Stage 1**: Audio to Discrete Tokens - Converts raw audio into discrete representations using FSQ (S3Tokenizer)
132
-
133
- **Stage 2**: Discrete Tokens to Continuous Latent Space - Maps discrete tokens to continuous latent space using VAE
134
-
135
- ### Training Pipeline
136
- 1. Extract discrete tokens using trained FSQ S3Tokenizer
137
- 2. Generate continuous latent representations using trained DAC-VAE
138
- 3. Train Stage 1: BPE tokens → Discrete FSQ
139
- 4. Train Stage 2: Discrete FSQ → DAC-VAE Continuous latent space
140
-
141
- ### Links
142
- - [GitHub Repository](https://github.com/primepake/learnable-speech)
143
- - [Technical Paper](https://arxiv.org/pdf/2505.07916)
144
- """
145
- )
146
-
147
- with gr.Row():
148
- gr.Examples(
149
- examples=[
150
- ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
151
- ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
152
- ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
153
- ["This implementation uses flow matching for high-quality 24kHz audio generation."],
154
- ],
155
- inputs=[text_input],
156
- fn=lambda x: synthesize_speech(x, 0),
157
- outputs=audio_output,
158
- cache_examples=False,
159
- label="Example Texts"
160
- )
161
-
162
  generate_btn.click(
163
  fn=synthesize_speech,
164
  inputs=[text_input, speaker_slider],
165
  outputs=audio_output
166
  )
 
 
 
 
 
 
 
 
 
 
167
 
168
  return demo
169
 
170
  if __name__ == "__main__":
171
- # Get environment variables for flexible deployment
172
  port = int(os.environ.get("PORT", 7860))
173
  host = os.environ.get("HOST", "0.0.0.0")
174
 
175
  demo = create_demo()
176
 
177
- # Try to launch with error handling
178
  try:
179
  demo.launch(
180
  server_name=host,
@@ -184,7 +110,7 @@ if __name__ == "__main__":
184
  quiet=False,
185
  enable_queue=True
186
  )
187
- except Exception as e:
188
  print(f"Failed to launch on {host}:{port}, trying with share=True")
189
  demo.launch(
190
  share=True,
 
10
  if not text.strip():
11
  return None
12
 
 
13
  sample_rate = 24000
14
  duration = max(1.0, len(text) * 0.08) # rough estimate
15
  samples = int(sample_rate * duration)
16
 
17
+ # Generate sine-based waveform
18
+ t = np.linspace(0, duration, samples, endpoint=False)
19
+ frequency = 440 + (speaker_id * 50)
20
 
 
21
  audio = (
22
  0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t/(duration*0.8)) +
23
  0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t/duration) +
24
+ 0.05 * np.random.randn(samples)
25
  )
26
 
27
+ # Fade in/out safely
28
+ fade_samples = min(int(0.1 * sample_rate), samples // 2)
29
+ if fade_samples > 0:
30
+ audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
31
+ audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
32
 
33
  return (sample_rate, audio.astype(np.float32))
34
 
 
49
  An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
50
 
51
  > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """
53
  )
54
 
 
61
  value="Hello, this is a demo of Learnable-Speech synthesis."
62
  )
63
 
64
+ speaker_slider = gr.Slider(
65
+ minimum=0,
66
+ maximum=10,
67
+ value=0,
68
+ step=1,
69
+ label="Speaker ID"
70
+ )
 
71
 
72
+ generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
73
 
74
  with gr.Column():
75
  audio_output = gr.Audio(
 
77
  type="numpy"
78
  )
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  generate_btn.click(
81
  fn=synthesize_speech,
82
  inputs=[text_input, speaker_slider],
83
  outputs=audio_output
84
  )
85
+
86
+ gr.Examples(
87
+ examples=[
88
+ ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
89
+ ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
90
+ ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
91
+ ["This implementation uses flow matching for high-quality 24kHz audio generation."],
92
+ ],
93
+ inputs=[text_input],
94
+ )
95
 
96
  return demo
97
 
98
  if __name__ == "__main__":
 
99
  port = int(os.environ.get("PORT", 7860))
100
  host = os.environ.get("HOST", "0.0.0.0")
101
 
102
  demo = create_demo()
103
 
 
104
  try:
105
  demo.launch(
106
  server_name=host,
 
110
  quiet=False,
111
  enable_queue=True
112
  )
113
+ except Exception:
114
  print(f"Failed to launch on {host}:{port}, trying with share=True")
115
  demo.launch(
116
  share=True,