Commit 36fbe52 · Parent: aeff66c
Fix Gradio app deployment issues
app.py CHANGED
@@ -10,26 +10,25 @@ def synthesize_speech(text, speaker_id=0):
     if not text.strip():
         return None
 
-    # This is a placeholder - replace with actual model inference
     sample_rate = 24000
     duration = max(1.0, len(text) * 0.08)  # rough estimate
     samples = int(sample_rate * duration)
 
-    # Generate
-    t = np.linspace(0, duration, samples)
-    frequency = 440 + (speaker_id * 50)
+    # Generate sine-based waveform
+    t = np.linspace(0, duration, samples, endpoint=False)
+    frequency = 440 + (speaker_id * 50)
 
-    # Create a more interesting waveform
     audio = (
         0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t/(duration*0.8)) +
         0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t/duration) +
-        0.05 * np.random.randn(samples)
+        0.05 * np.random.randn(samples)
     )
 
-    #
-    fade_samples = int(0.1 * sample_rate)
-
-
+    # Fade in/out safely
+    fade_samples = min(int(0.1 * sample_rate), samples // 2)
+    if fade_samples > 0:
+        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
+        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
 
     return (sample_rate, audio.astype(np.float32))
 
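A note on the clamp introduced above: at 24 kHz, `int(0.1 * sample_rate)` is 2,400 samples, so for any clip shorter than 0.2 s the fade-in and fade-out slices would overlap and distort the signal. A minimal, self-contained sketch of the same clamped-fade logic (`apply_fades` is an illustrative helper, not a function from app.py):

```python
import numpy as np

def apply_fades(audio: np.ndarray, sample_rate: int, fade_sec: float = 0.1) -> np.ndarray:
    """Linear fade-in/out, clamped so the two ramps can never overlap."""
    samples = len(audio)
    fade_samples = min(int(fade_sec * sample_rate), samples // 2)
    if fade_samples > 0:
        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)   # ramp up
        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)  # ramp down
    return audio

# A 50 ms clip at 24 kHz is 1,200 samples; the unclamped fade would be
# 2,400 samples, so the clamp caps each ramp at 600 samples instead.
clip = np.ones(1200, dtype=np.float32)
faded = apply_fades(clip, 24000)
assert faded[0] == 0.0 and faded[-1] == 0.0
```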
@@ -50,25 +49,6 @@ def create_demo():
             An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
 
             > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**
-
-            ## 🚀 How to Train Your Own Model:
-
-            1. **Follow the [Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)**
-            2. **Use the provided training scripts** in the `scripts/` directory
-            3. **Upload your trained models** to Hugging Face Hub
-            4. **Replace the placeholder code** in this Space with your models
-
-            ### Quick Start:
-            ```bash
-            # 1. Prepare your dataset
-            ./scripts/prepare_data.sh
-
-            # 2. Train the model
-            ./scripts/train_full_pipeline.sh
-
-            # 3. Upload to Hugging Face
-            python scripts/upload_to_hf.py --username your_username
-            ```
             """
         )
 
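The deleted steps point at eventually replacing the placeholder synthesis with real inference. As a rough sketch of that swap, following the two-stage pipeline this page describes (text → discrete FSQ tokens → DAC-VAE latents → waveform); the `stage1`/`stage2` objects and their methods are hypothetical stand-ins, since the actual checkpoint loading and inference API depend on the trained models:

```python
import numpy as np

# Hypothetical handles to trained checkpoints - not an API from this repo.
# stage1 = load_stage1("your_username/learnable-speech-stage1")  # text -> FSQ tokens
# stage2 = load_stage2("your_username/learnable-speech-stage2")  # tokens -> waveform

def synthesize_speech(text, speaker_id=0):
    if not text.strip():
        return None
    sample_rate = 24000
    # tokens = stage1.generate(text, speaker_id=speaker_id)  # Stage 1 (hypothetical)
    # audio = stage2.decode(tokens)                          # Stage 2 (hypothetical)
    audio = np.zeros(sample_rate, dtype=np.float32)  # silence until models exist
    return (sample_rate, audio.astype(np.float32))
```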
@@ -81,16 +61,15 @@ def create_demo():
                     value="Hello, this is a demo of Learnable-Speech synthesis."
                 )
 
-
-
-
-
-
-
-
-                )
+                speaker_slider = gr.Slider(
+                    minimum=0,
+                    maximum=10,
+                    value=0,
+                    step=1,
+                    label="Speaker ID"
+                )
 
-                generate_btn = gr.Button("🎵 Generate Speech", variant="primary"
+                generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
 
             with gr.Column():
                 audio_output = gr.Audio(
@@ -98,83 +77,30 @@ def create_demo():
                     type="numpy"
                 )
 
-        with gr.Accordion("🎯 Training Status & Next Steps", open=True):
-            gr.Markdown(
-                """
-                ### 📋 Current Status:
-                - ✅ **Demo Interface**: Ready
-                - ❌ **Trained Models**: Not available (placeholder audio only)
-                - ❌ **Model Inference**: Not implemented yet
-
-                ### 🔧 To Enable Real Speech Synthesis:
-                1. **Train the models** using the provided pipeline
-                2. **Upload trained checkpoints** to Hugging Face Hub
-                3. **Update the inference code** in `synthesize_speech()` function
-                4. **Test with real model outputs**
-
-                ### 📚 Resources:
-                - [📖 Complete Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)
-                - [🛠️ Training Scripts](https://github.com/primepake/learnable-speech/tree/main/scripts)
-                - [📄 Research Paper](https://arxiv.org/pdf/2505.07916)
-                - [💻 GitHub Repository](https://github.com/primepake/learnable-speech)
-                """
-            )
-        gr.Markdown(
-            """
-            ### Key Features
-            - **24kHz Audio Support**: High-quality audio generation at 24kHz sampling rate
-            - **Flow matching AE**: Flow matching training for autoencoders
-            - **Immiscible assignment**: Support immiscible adding noise while training
-            - **Contrastive Flow matching**: Support Contrastive training
-
-            ### Architecture
-            **Stage 1**: Audio to Discrete Tokens - Converts raw audio into discrete representations using FSQ (S3Tokenizer)
-
-            **Stage 2**: Discrete Tokens to Continuous Latent Space - Maps discrete tokens to continuous latent space using VAE
-
-            ### Training Pipeline
-            1. Extract discrete tokens using trained FSQ S3Tokenizer
-            2. Generate continuous latent representations using trained DAC-VAE
-            3. Train Stage 1: BPE tokens → Discrete FSQ
-            4. Train Stage 2: Discrete FSQ → DAC-VAE Continuous latent space
-
-            ### Links
-            - [GitHub Repository](https://github.com/primepake/learnable-speech)
-            - [Technical Paper](https://arxiv.org/pdf/2505.07916)
-            """
-        )
-
-    with gr.Row():
-        gr.Examples(
-            examples=[
-                ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
-                ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
-                ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
-                ["This implementation uses flow matching for high-quality 24kHz audio generation."],
-            ],
-            inputs=[text_input],
-            fn=lambda x: synthesize_speech(x, 0),
-            outputs=audio_output,
-            cache_examples=False,
-            label="Example Texts"
-        )
-
         generate_btn.click(
             fn=synthesize_speech,
             inputs=[text_input, speaker_slider],
             outputs=audio_output
         )
+
+        gr.Examples(
+            examples=[
+                ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
+                ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
+                ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
+                ["This implementation uses flow matching for high-quality 24kHz audio generation."],
+            ],
+            inputs=[text_input],
+        )
 
     return demo
 
 if __name__ == "__main__":
-    # Get environment variables for flexible deployment
     port = int(os.environ.get("PORT", 7860))
     host = os.environ.get("HOST", "0.0.0.0")
 
     demo = create_demo()
 
-    # Try to launch with error handling
     try:
         demo.launch(
             server_name=host,
@@ -184,7 +110,7 @@ if __name__ == "__main__":
             quiet=False,
             enable_queue=True
         )
-    except Exception
+    except Exception:
         print(f"Failed to launch on {host}:{port}, trying with share=True")
         demo.launch(
             share=True,
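A portability note on the launch block: `enable_queue=True` is accepted by older Gradio releases, but the argument was removed from `launch()` in Gradio 4.x, where queueing is enabled via `demo.queue()` instead. A version-tolerant sketch of the same entry point under that assumption (reusing `create_demo()` from this app):

```python
import os

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    host = os.environ.get("HOST", "0.0.0.0")

    demo = create_demo()
    demo.queue()  # Gradio 4.x replacement for launch(enable_queue=True)
    try:
        demo.launch(server_name=host, server_port=port, quiet=False)
    except Exception:
        # Binding host:port failed; fall back to a shareable tunnel.
        print(f"Failed to launch on {host}:{port}, trying with share=True")
        demo.launch(share=True)
```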