Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Files Community

DevNumb committed on Dec 5, 2025

Commit

9de60e0

verified ·

1 Parent(s): 6099104

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -128

app.py CHANGED Viewed

@@ -4,10 +4,12 @@ import numpy as np
 import tempfile
 import time
 import warnings
 warnings.filterwarnings("ignore")
-# Custom CSS for beautiful UI
-custom_css = """
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
@@ -106,7 +108,7 @@ custom_css = """
     left: 100%;
 }
-.fancy-textbox textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
     border-radius: 15px !important;
@@ -116,7 +118,7 @@ custom_css = """
     transition: all 0.3s ease !important;
 }
-.fancy-textbox textarea:focus {
     border-color: #667eea !important;
     box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
     background: rgba(255, 255, 255, 0.08) !important;
@@ -174,13 +176,13 @@ custom_css = """
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
 }
-.custom-slider .gr-slider {
     background: rgba(255, 255, 255, 0.1) !important;
     height: 8px !important;
     border-radius: 10px !important;
 }
-.custom-slider .gr-slider::-webkit-slider-thumb {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     border: none !important;
     width: 24px !important;
@@ -204,15 +206,21 @@ custom_css = """
     border-color: rgba(255, 255, 255, 0.5) !important;
     transform: translateY(-2px) !important;
 }
 """
-# Global variable for model (simple caching)
 _tts_model = None
-_tts_processor = None
 def load_model():
     """Load the TTS model once"""
-    global _tts_model, _tts_processor
     if _tts_model is None:
         print("🚀 Loading VibeVoice model...")
@@ -258,19 +266,14 @@ stats = TTSStats()
 def generate_simple_tone(text, sampling_rate=16000):
     """Generate a simple tone for fallback"""
-    # Create tone based on text
-    duration = min(len(text) * 0.05, 5)  # Up to 5 seconds
     t = np.linspace(0, duration, int(sampling_rate * duration))
-    # Generate tone with varying frequency based on text
-    base_freq = 220 + (hash(text) % 200)  # Vary frequency
     audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
-    # Add harmonics
     audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
     audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
-    # Envelope to make it sound more natural
     envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
     audio *= envelope
@@ -285,24 +288,18 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
         if len(text) > 1000:
             text = text[:1000]
-        # Update stats
         stats.add_generation(text)
-        # Load model
         model = load_model()
         if model == "simple":
-            # Use simple tone generation
             audio, sampling_rate = generate_simple_tone(text)
-            message = f"⚠️ Using simple tone generation (model not available)<br>Text: {text[:50]}..."
         else:
-            # Use transformer pipeline
             print(f"Generating speech for: {text[:50]}...")
             result = model(text)
             audio = result["audio"]
             sampling_rate = result["sampling_rate"]
-            # Format message based on emotion
             emotion_icons = {
                 "neutral": "😐",
                 "happy": "😊",
@@ -314,7 +311,9 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
             message = f"{icon} Generated {len(text)} characters with {emotion} tone"
         # Normalize audio
-        audio = audio / np.max(np.abs(audio)) * 0.95
         # Apply speed adjustment
         if speed != 1.0:
@@ -323,17 +322,15 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
             audio = signal.resample(audio, new_length)
         # Save to temporary file
-        import scipy.io.wavfile
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
             success_message = f"""
-            <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
                 <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
                 <div style='color: rgba(255,255,255,0.8);'>
                     Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
-                    Speed: <strong>{speed}x</strong> |
-                    Emotion: <strong>{emotion}</strong>
                 </div>
             </div>
             """
@@ -341,9 +338,7 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
     except Exception as e:
         print(f"Error generating speech: {e}")
-        # Create silent audio as fallback
         try:
-            import scipy.io.wavfile
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
@@ -375,18 +370,17 @@ def update_stats_display():
     </div>
     """
-# Create the interface
-with gr.Blocks(
-    title="🎵 VibeVoice TTS",
-    css=custom_css
-) as demo:
     # Header Section
-    with gr.Column(elem_classes="header"):
         gr.HTML("""
-        <div style="text-align: center;">
             <h1>🎵 VibeVoice TTS</h1>
-            <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Speech</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
                     🤖 AI Powered
@@ -404,110 +398,99 @@ with gr.Blocks(
     # Main Content
     with gr.Row():
         # Left Panel - Input Controls
-        with gr.Column(scale=1, elem_classes="glass-card"):
             gr.Markdown("### 📝 Input Text")
             text_input = gr.Textbox(
                 label="",
                 placeholder="Enter your text here... (Max 1000 characters)",
-                lines=6,
-                elem_classes="fancy-textbox"
             )
             gr.Markdown("### 🎭 Voice Settings")
-            with gr.Row():
-                emotion = gr.Dropdown(
-                    label="Voice Emotion",
-                    choices=["neutral", "happy", "excited", "calm", "professional"],
-                    value="neutral"
-                )
-            with gr.Row():
-                speed = gr.Slider(
-                    minimum=0.5,
-                    maximum=2.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Speaking Speed",
-                    elem_classes="custom-slider"
-                )
             # Action Buttons
             with gr.Row():
-                generate_btn = gr.Button(
-                    "✨ Generate Speech",
-                    variant="primary",
-                    elem_classes="glow-button"
-                )
-                clear_btn = gr.Button(
-                    "Clear",
-                    variant="secondary",
-                    elem_classes="secondary-button"
-                )
             # Quick Actions
             gr.Markdown("### ⚡ Quick Actions")
             with gr.Row():
-                quick_test = gr.Button("Test Voice", elem_classes="secondary-button")
-                quick_clear = gr.Button("Clear Text", elem_classes="secondary-button")
         # Right Panel - Output Display
-        with gr.Column(scale=1, elem_classes="glass-card"):
             gr.Markdown("### 🎧 Generated Audio")
-            with gr.Column(elem_classes="audio-player"):
-                audio_output = gr.Audio(
-                    label="",
-                    type="filepath"
-                )
-                # Status and Info
-                status_display = gr.HTML(
-                    value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
-                )
-    # Bottom Section - Stats and Examples
-    with gr.Column(elem_classes="glass-card"):
-        with gr.Tabs(elem_classes="tab-nav"):
-            with gr.TabItem("📈 Statistics"):
-                stats_display = gr.HTML(
-                    value=update_stats_display()
-                )
-                refresh_stats = gr.Button("Refresh Stats", elem_classes="secondary-button")
-            with gr.TabItem("💡 Examples"):
-                gr.Examples(
-                    examples=[
-                        ["Hello, welcome to VibeVoice text-to-speech!"],
-                        ["This is a demonstration of AI speech synthesis."],
-                        ["The weather is beautiful today."],
-                        ["Artificial intelligence is amazing technology."],
-                        ["Please enjoy this text to speech demonstration."]
-                    ],
-                    inputs=text_input,
-                    label="Click any example to try it"
-                )
-            with gr.TabItem("ℹ️ About"):
-                gr.Markdown("""
-                ## About VibeVoice TTS
-                This application converts text into speech using AI technology.
-                ### Features:
-                - **AI-Powered**: Uses advanced machine learning models
-                - **Multiple Emotions**: Choose different voice tones
-                - **Adjustable Speed**: Control speaking rate
-                - **Real-time**: Fast generation
-                ### Tips:
-                - Keep text under 500 characters for best results
-                - Try different emotions for varied expressions
-                - Adjust speed to match your preference
-                ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
-                """)
     # Footer
     gr.HTML("""
@@ -520,7 +503,6 @@ with gr.Blocks(
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
-        """Handle speech generation"""
         if not text or text.strip() == "":
             return None, "⚠️ Please enter some text first!", update_stats_display()
@@ -533,8 +515,7 @@ with gr.Blocks(
         return "", None, "Cleared. Ready for new input.", update_stats_display()
     def test_voice():
-        test_text = "Hello! This is a test of the VibeVoice text-to-speech system."
-        return test_text
     # Connect buttons
     generate_btn.click(
@@ -566,13 +547,6 @@ with gr.Blocks(
         inputs=[],
         outputs=[stats_display]
     )
-    # Initialize stats on load
-    demo.load(
-        fn=update_stats_display,
-        inputs=[],
-        outputs=[stats_display]
-    )
 # Launch the app
 if __name__ == "__main__":

 import tempfile
 import time
 import warnings
+import scipy.io.wavfile
 warnings.filterwarnings("ignore")
+# Inline CSS for Gradio 3.x
+css = """
+<style>
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
     left: 100%;
 }
+textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
     border-radius: 15px !important;
     transition: all 0.3s ease !important;
 }
+textarea:focus {
     border-color: #667eea !important;
     box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
     background: rgba(255, 255, 255, 0.08) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
 }
+input[type="range"] {
     background: rgba(255, 255, 255, 0.1) !important;
     height: 8px !important;
     border-radius: 10px !important;
 }
+input[type="range"]::-webkit-slider-thumb {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     border: none !important;
     width: 24px !important;
     border-color: rgba(255, 255, 255, 0.5) !important;
     transform: translateY(-2px) !important;
 }
+#component-0 {
+    min-height: 100vh;
+    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
+    padding: 2rem;
+}
+</style>
 """
+# Global variable for model
 _tts_model = None
 def load_model():
     """Load the TTS model once"""
+    global _tts_model
     if _tts_model is None:
         print("🚀 Loading VibeVoice model...")
 def generate_simple_tone(text, sampling_rate=16000):
     """Generate a simple tone for fallback"""
+    duration = min(len(text) * 0.05, 5)
     t = np.linspace(0, duration, int(sampling_rate * duration))
+    base_freq = 220 + (hash(text) % 200)
     audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
     audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
     audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
     envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
     audio *= envelope
         if len(text) > 1000:
             text = text[:1000]
         stats.add_generation(text)
         model = load_model()
         if model == "simple":
             audio, sampling_rate = generate_simple_tone(text)
+            message = f"⚠️ Using simple tone generation (model not available)"
         else:
             print(f"Generating speech for: {text[:50]}...")
             result = model(text)
             audio = result["audio"]
             sampling_rate = result["sampling_rate"]
             emotion_icons = {
                 "neutral": "😐",
                 "happy": "😊",
             message = f"{icon} Generated {len(text)} characters with {emotion} tone"
         # Normalize audio
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            audio = audio / max_val * 0.95
         # Apply speed adjustment
         if speed != 1.0:
             audio = signal.resample(audio, new_length)
         # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
             success_message = f"""
+            <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
                 <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
                 <div style='color: rgba(255,255,255,0.8);'>
                     Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
+                    Speed: <strong>{speed}x</strong>
                 </div>
             </div>
             """
     except Exception as e:
         print(f"Error generating speech: {e}")
         try:
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
     </div>
     """
+# Create the interface with proper Gradio 3.x syntax
+with gr.Blocks() as demo:
+    # Add CSS as HTML
+    gr.HTML(css)
     # Header Section
+    with gr.Column():
         gr.HTML("""
+        <div class="header">
             <h1>🎵 VibeVoice TTS</h1>
+            <p>Transform Text into Natural Speech</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
                     🤖 AI Powered
     # Main Content
     with gr.Row():
         # Left Panel - Input Controls
+        with gr.Column(scale=1):
+            gr.HTML('<div class="glass-card">')
             gr.Markdown("### 📝 Input Text")
             text_input = gr.Textbox(
                 label="",
                 placeholder="Enter your text here... (Max 1000 characters)",
+                lines=6
             )
             gr.Markdown("### 🎭 Voice Settings")
+            emotion = gr.Dropdown(
+                label="Voice Emotion",
+                choices=["neutral", "happy", "excited", "calm", "professional"],
+                value="neutral"
+            )
+            speed = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=1.0,
+                step=0.1,
+                label="Speaking Speed"
+            )
             # Action Buttons
             with gr.Row():
+                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
+                clear_btn = gr.Button("Clear", variant="secondary")
             # Quick Actions
             gr.Markdown("### ⚡ Quick Actions")
             with gr.Row():
+                quick_test = gr.Button("Test Voice", variant="secondary")
+                quick_clear = gr.Button("Clear Text", variant="secondary")
+            gr.HTML('</div>')
         # Right Panel - Output Display
+        with gr.Column(scale=1):
+            gr.HTML('<div class="glass-card">')
             gr.Markdown("### 🎧 Generated Audio")
+            gr.HTML('<div class="audio-player">')
+            audio_output = gr.Audio(label="", type="filepath")
+            status_display = gr.HTML(
+                value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
+            )
+            gr.HTML('</div>')
+            gr.HTML('</div>')
+    # Bottom Section - Tabs
+    gr.HTML('<div class="glass-card">')
+    with gr.Tabs():
+        with gr.TabItem("📈 Statistics"):
+            stats_display = gr.HTML(value=update_stats_display())
+            refresh_stats = gr.Button("Refresh Stats", variant="secondary")
+        with gr.TabItem("💡 Examples"):
+            gr.Examples(
+                examples=[
+                    ["Hello, welcome to VibeVoice text-to-speech!"],
+                    ["This is a demonstration of AI speech synthesis."],
+                    ["The weather is beautiful today."],
+                    ["Artificial intelligence is amazing technology."],
+                    ["Please enjoy this text to speech demonstration."]
+                ],
+                inputs=text_input,
+                label="Click any example to try it"
+            )
+        with gr.TabItem("ℹ️ About"):
+            gr.Markdown("""
+            ## About VibeVoice TTS
+            This application converts text into speech using AI technology.
+            ### Features:
+            - **AI-Powered**: Uses advanced machine learning models
+            - **Multiple Emotions**: Choose different voice tones
+            - **Adjustable Speed**: Control speaking rate
+            - **Real-time**: Fast generation
+            ### Tips:
+            - Keep text under 500 characters for best results
+            - Try different emotions for varied expressions
+            - Adjust speed to match your preference
+            ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
+            """)
+    gr.HTML('</div>')
     # Footer
     gr.HTML("""
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
         if not text or text.strip() == "":
             return None, "⚠️ Please enter some text first!", update_stats_display()
         return "", None, "Cleared. Ready for new input.", update_stats_display()
     def test_voice():
+        return "Hello! This is a test of the VibeVoice text-to-speech system."
     # Connect buttons
     generate_btn.click(
         inputs=[],
         outputs=[stats_display]
     )
 # Launch the app
 if __name__ == "__main__":